diff --git a/UI/__pycache__/gradio.cpython-310.pyc b/UI/__pycache__/gradio.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e268c233039b5025d099cbfab83b8e032b20507a
Binary files /dev/null and b/UI/__pycache__/gradio.cpython-310.pyc differ
diff --git a/UI/__pycache__/gradio.cpython-311.pyc b/UI/__pycache__/gradio.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2de60032ba3110fc885333ad0734831e2570a162
Binary files /dev/null and b/UI/__pycache__/gradio.cpython-311.pyc differ
diff --git a/UI/gradio.py b/UI/gradio.py
new file mode 100644
index 0000000000000000000000000000000000000000..eec1b35b24ea316265bccd78fe16548988499a75
--- /dev/null
+++ b/UI/gradio.py
@@ -0,0 +1,516 @@
+import gradio as gr
+from utils.watermark import Watermarker
+from utils.config import load_config
+from renderers.highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html
+from renderers.tree import generate_subplot1, generate_subplot2
+from pathlib import Path
+import time
+from typing import Dict, List, Tuple, Any
+import plotly.graph_objects as go
+
+class WatermarkerInterface:
+ def __init__(self, config):
+
+ self.pipeline = Watermarker(config)
+ self.common_grams = {}
+ self.highlight_info = []
+ self.masked_sentences = []
+
+ def handle_paraphrase(self, prompt: str) -> Tuple[str, str, str, str]:
+ """Wrapper for paraphrasing that includes highlighting"""
+ start_time = time.time()
+
+ # Run paraphrasing
+ self.pipeline.Paraphrase(prompt)
+
+ # Step 1: Process the original sentence first
+ seen_ngrams = {} # Stores first occurrence index of each n-gram
+ original_indexed_ngrams = [] # Final indexed list for original
+
+ original_sentence = self.pipeline.user_prompt
+ original_ngrams = self.pipeline.common_grams.get(original_sentence, {})
+
+ # Step 1.1: Extract n-grams and their first occurrence index
+ ngram_occurrences = [
+ (min(indices, key=lambda x: x[0])[0], gram) # Get first index
+ for gram, indices in original_ngrams.items()
+ ]
+
+ # Step 1.2: Sort n-grams based on their first occurrence
+ ngram_occurrences.sort()
+
+ # Step 1.3: Assign sequential indices
+ for idx, (position, gram) in enumerate(ngram_occurrences, start=1):
+ seen_ngrams[gram] = idx # Assign sequential index
+ original_indexed_ngrams.append((idx, gram))
+
+ print("Original Indexed N-grams:", original_indexed_ngrams)
+
+ #generate highlight_info
+ colors = ["red", "blue", "green", "purple", "orange"]
+ highlight_info = [
+ (ngram, colors[i % len(colors)])
+ for i, (index, ngram) in enumerate(original_indexed_ngrams)
+ ]
+ common_grams = original_indexed_ngrams
+ self.highlight_info = highlight_info
+ self.common_grams = common_grams
+
+ # Step 2: Process paraphrased sentences and match indices
+ paraphrase_indexed_ngrams = {}
+
+ for sentence in self.pipeline.paraphrased_sentences:
+ sentence_ngrams = [] # Stores n-grams for this sentence
+ sentence_ngrams_dict = self.pipeline.common_grams.get(sentence, {})
+
+ for gram, indices in sentence_ngrams_dict.items():
+ first_occurrence = min(indices, key=lambda x: x[0])[0]
+
+ # Use the original's index if exists, otherwise assign a new one
+ if gram in seen_ngrams:
+ index = seen_ngrams[gram] # Use the same index as original
+ else:
+ index = len(seen_ngrams) + 1 # Assign new index
+ seen_ngrams[gram] = index # Store it
+
+ sentence_ngrams.append((index, gram))
+
+ sentence_ngrams.sort()
+ paraphrase_indexed_ngrams[sentence] = sentence_ngrams
+
+ print("Paraphrase Indexed N-grams:", paraphrase_indexed_ngrams)
+
+ # Step 3: Generate highlighted versions using the renderer
+ highlighted_prompt = highlight_common_words(
+ common_grams,
+ [self.pipeline.user_prompt],
+ "Original Prompt with Highlighted Common Sequences"
+ )
+
+ highlighted_accepted = highlight_common_words_dict(
+ common_grams,
+ self.pipeline.selected_sentences,
+ "Accepted Paraphrased Sentences with Entailment Scores"
+ )
+
+ highlighted_discarded = highlight_common_words_dict(
+ common_grams,
+ self.pipeline.discarded_sentences,
+ "Discarded Paraphrased Sentences with Entailment Scores"
+ )
+
+ execution_time = f"
Step 1 completed in {time.time() - start_time:.2f} seconds
"
+ self.highlight_info = highlight_info
+ self.common_grams = common_grams
+
+ return highlighted_prompt, highlighted_accepted, highlighted_discarded, execution_time
+
+    def handle_masking(self) -> List[Any]:
+ """Wrapper for masking that generates visualization trees"""
+ start_time = time.time()
+
+ masking_results = self.pipeline.Masking()
+ trees = []
+ highlight_info = self.highlight_info
+ common_grams = self.common_grams
+ sentence_to_masked = {}
+
+ # Create a consolidated figure with all strategies
+ original_sentence = None
+
+ # First pass - gather all sentences and strategies
+ for strategy, sentence_dict in masking_results.items():
+ for sent, data in sentence_dict.items():
+ if sent not in sentence_to_masked:
+ sentence_to_masked[sent] = []
+ try:
+ if not isinstance(data, dict):
+ print(f"[ERROR] Data is not a dictionary for {sent} with strategy {strategy}")
+ continue
+
+ masked_sentence = data.get("masked_sentence", "")
+ if masked_sentence:
+ sentence_to_masked[sent].append((masked_sentence, strategy))
+ except Exception as e:
+ print(f"Error processing {strategy} for sentence {sent}: {e}")
+
+ for original_sentence, masked_sentences_data in sentence_to_masked.items():
+ if not masked_sentences_data:
+ continue
+ masked_sentences = [ms[0] for ms in masked_sentences_data]
+ strategies = [ms[1] for ms in masked_sentences_data]
+ try:
+
+ fig = generate_subplot1(
+ original_sentence,
+ masked_sentences,
+ strategies,
+ highlight_info,
+ common_grams
+ )
+ trees.append(fig)
+ except Exception as e:
+ print(f"Error generating multi-strategy tree: {e}")
+ trees.append(go.Figure())
+
+ # Pad with empty plots if needed
+ while len(trees) < 10:
+ trees.append(go.Figure())
+
+ execution_time = f"Step 2 completed in {time.time() - start_time:.2f} seconds
"
+
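+        # The flat list below lines up positionally with the Gradio outputs wired in
+        # create_gradio_interface: ten gr.Plot components (tree1_plots) followed by the
+        # step-2 timing gr.HTML (step2_time).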
+ return trees[:10] + [execution_time]
+
+    def handle_sampling(self) -> List[Any]:
+ """Wrapper for sampling that generates visualization trees"""
+ start_time = time.time()
+ sampling_results = self.pipeline.Sampling()
+ trees = []
+
+ # Group sentences by original sentence
+ organized_results = {}
+
+ # Generate trees for each sampled sentence
+ for sampling_strategy, masking_dict in sampling_results.items():
+ for masking_strategy, sentences in masking_dict.items():
+ for original_sentence, data in sentences.items():
+ if original_sentence not in organized_results:
+ organized_results[original_sentence] = {}
+
+ if masking_strategy not in organized_results[original_sentence]:
+ organized_results[original_sentence][masking_strategy] = {
+ "masked_sentence": data.get("masked_sentence", ""), # Corrected reference
+ "sampled_sentences": {}
+ }
+
+ # Add this sampling result
+ organized_results[original_sentence][masking_strategy]["sampled_sentences"][sampling_strategy] = data.get("sampled_sentence", "")
+
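+        # organized_results is keyed as:
+        #   {original_sentence: {masking_strategy: {"masked_sentence": str,
+        #                                           "sampled_sentences": {sampling_strategy: str}}}}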
+ for original_sentence, data in organized_results.items():
+ masked_sentences = []
+ all_sampled_sentences = []
+
+ for masking_strategy, masking_data in list(data.items())[:3]: # Ensure this iteration is safe
+ masked_sentence = masking_data.get("masked_sentence", "")
+ if masked_sentence:
+ masked_sentences.append(masked_sentence)
+
+ for sampling_strategy, sampled_sentence in masking_data.get("sampled_sentences", {}).items():
+ if sampled_sentence:
+ all_sampled_sentences.append(sampled_sentence)
+
+ if masked_sentences:
+ try:
+ fig = generate_subplot2(
+ masked_sentences,
+ all_sampled_sentences,
+ self.highlight_info,
+ self.common_grams
+ )
+ trees.append(fig)
+ except Exception as e:
+ print(f"Error generating subplot for {original_sentence}: {e}")
+ trees.append(go.Figure())
+
+ while len(trees) < 10:
+ trees.append(go.Figure())
+
+ execution_time = f"Step 3 completed in {time.time() - start_time:.2f} seconds
"
+
+ return trees[:10] + [execution_time]
+
+    def handle_reparaphrasing(self) -> List[str]:
+ """Wrapper for re-paraphrasing that formats results as HTML"""
+ start_time = time.time()
+
+ results = self.pipeline.re_paraphrasing()
+ html_outputs = []
+
+ # Generate HTML for each batch of re-paraphrased sentences
+ for sampling_strategy, masking_dict in results.items():
+ for masking_strategy, sentences in masking_dict.items():
+ for original_sent, data in sentences.items():
+ if data["re_paraphrased_sentences"]:
+ html = reparaphrased_sentences_html(data["re_paraphrased_sentences"])
+ html_outputs.append(html)
+
+ # Pad with empty HTML if needed
+ while len(html_outputs) < 120:
+ html_outputs.append("")
+
+ execution_time = f"Step 4 completed in {time.time() - start_time:.2f} seconds
"
+
+ return html_outputs[:120] + [execution_time]
+
+
+def create_gradio_interface(config):
+ """Creates the Gradio interface with the updated pipeline"""
+ interface = WatermarkerInterface(config)
+
+ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
+ #CSS to enable scrolling for reparaphrased sentences and sampling plots
+ demo.css = """
+/* Set fixed height for the reparaphrased tabs container only */
+.gradio-container .tabs[id="reparaphrased-tabs"],
+.gradio-container .tabs[id="sampling-tabs"] {
+ overflow-x: hidden;
+ white-space: normal;
+ border-radius: 8px;
+ max-height: 600px; /* Set fixed height for the entire tabs component */
+ overflow-y: auto; /* Enable vertical scrolling inside the container */
+}
+
+/* Tab content styling for reparaphrased and sampling tabs */
+.gradio-container .tabs[id="reparaphrased-tabs"] .tabitem,
+.gradio-container .tabs[id="sampling-tabs"] .tabitem {
+ overflow-x: hidden;
+ white-space: normal;
+ display: block;
+ border-radius: 8px;
+}
+
+/* Make the tab navigation fixed at the top for scrollable tabs */
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav {
+ display: flex;
+ overflow-x: auto;
+ white-space: nowrap;
+ scrollbar-width: thin;
+ border-radius: 8px;
+ scrollbar-color: #888 #f1f1f1;
+ position: sticky;
+ top: 0;
+ background: white;
+ z-index: 100;
+}
+
+/* Dropdown menu for scrollable tabs styling */
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown {
+ position: relative;
+ display: inline-block;
+}
+
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content {
+ display: none;
+ position: absolute;
+ background-color: #f9f9f9;
+ min-width: 160px;
+ box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
+ z-index: 1;
+ max-height: 300px;
+ overflow-y: auto;
+}
+
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown:hover .tab-dropdown-content,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown:hover .tab-dropdown-content {
+ display: block;
+}
+
+/* Scrollbar styling for scrollable tabs */
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar {
+ height: 8px;
+ border-radius: 8px;
+}
+
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar-track,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar-track {
+ background: #f1f1f1;
+ border-radius: 8px;
+}
+
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar-thumb,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar-thumb {
+ background: #888;
+ border-radius: 8px;
+}
+
+/* Tab button styling for scrollable tabs */
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-item,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-item {
+ flex: 0 0 auto;
+ border-radius: 8px;
+}
+
+/* Plot container styling specifically for sampling tabs */
+.gradio-container .tabs[id="sampling-tabs"] .plot-container {
+ min-height: 600px;
+ max-height: 1800px;
+ overflow-y: auto;
+}
+
+/* Ensure text wraps in HTML components */
+.gradio-container .prose {
+ white-space: normal;
+ word-wrap: break-word;
+ overflow-wrap: break-word;
+}
+
+/* Dropdown button styling for scrollable tabs */
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown button,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown button {
+ background-color: #f0f0f0;
+ border: 1px solid #ddd;
+ border-radius: 4px;
+ padding: 5px 10px;
+ cursor: pointer;
+ margin: 2px;
+}
+
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown button:hover,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown button:hover {
+ background-color: #e0e0e0;
+}
+
+/* Style dropdown content items for scrollable tabs */
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content div,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content div {
+ padding: 8px 12px;
+ cursor: pointer;
+}
+
+.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content div:hover,
+.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content div:hover {
+ background-color: #e0e0e0;
+}
+
+/* Custom styling for execution time display */
+.execution-time {
+ text-align: right;
+ padding: 8px 16px;
+ font-family: inherit;
+ color: #555;
+ font-size: 0.9rem;
+ font-style: italic;
+ margin-left: auto;
+ width: 100%;
+ border-top: 1px solid #eee;
+ margin-top: 8px;
+}
+
+/* Layout for section headers with execution time */
+.section-header {
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ width: 100%;
+ margin-bottom: 12px;
+}
+
+.section-header h3 {
+ margin: 0;
+}
+"""
+ gr.Markdown("# **AIISC Watermarking Model**")
+
+ with gr.Column():
+ gr.Markdown("## Input Prompt")
+ user_input = gr.Textbox(
+ label="Enter Your Prompt",
+ placeholder="Type your text here..."
+ )
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ gr.Markdown("## Step 1: Paraphrasing, LCS and Entailment Analysis")
+ with gr.Column(scale=1):
+ step1_time = gr.HTML()
+
+ paraphrase_button = gr.Button("Generate Paraphrases")
+ highlighted_user_prompt = gr.HTML(label="Highlighted User Prompt")
+
+ with gr.Tabs():
+ with gr.TabItem("Accepted Paraphrased Sentences"):
+ highlighted_accepted_sentences = gr.HTML()
+ with gr.TabItem("Discarded Paraphrased Sentences"):
+ highlighted_discarded_sentences = gr.HTML()
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ gr.Markdown("## Step 2: Where to Mask?")
+ with gr.Column(scale=1):
+ step2_time = gr.HTML()
+
+ masking_button = gr.Button("Apply Masking")
+ gr.Markdown("### Masked Sentence Trees")
+ tree1_plots = []
+ with gr.Tabs() as tree1_tabs:
+ for i in range(10):
+ with gr.TabItem(f"Masked Sentence {i+1}"):
+ tree1 = gr.Plot()
+ tree1_plots.append(tree1)
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ gr.Markdown("## Step 3: How to Mask?")
+ with gr.Column(scale=1):
+ step3_time = gr.HTML()
+
+ sampling_button = gr.Button("Sample Words")
+ gr.Markdown("### Sampled Sentence Trees")
+
+ tree2_plots = []
+ # Add elem_id to make this tab container scrollable
+ with gr.Tabs(elem_id="sampling-tabs") as tree2_tabs:
+ for i in range(10):
+ with gr.TabItem(f"Sampled Sentence {i+1}"):
+ # Add a custom class to the container to enable proper styling
+ with gr.Column(elem_classes=["plot-container"]):
+ tree2 = gr.Plot()
+ tree2_plots.append(tree2)
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ gr.Markdown("## Step 4: Re-paraphrasing")
+ with gr.Column(scale=1):
+ step4_time = gr.HTML()
+
+ reparaphrase_button = gr.Button("Re-paraphrase")
+ gr.Markdown("### Reparaphrased Sentences")
+ reparaphrased_sentences_tabs = []
+ with gr.Tabs(elem_id="reparaphrased-tabs") as reparaphrased_tabs:
+ for i in range(120):
+ with gr.TabItem(f"Reparaphrased Batch {i+1}"):
+ reparaphrased_sent_html = gr.HTML()
+ reparaphrased_sentences_tabs.append(reparaphrased_sent_html)
+
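+        # Each handler returns a flat list whose length matches the outputs list of its
+        # button binding below, e.g. handle_masking yields 10 figures plus one timing
+        # string for tree1_plots + [step2_time].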
+ # Connect the interface functions to the buttons
+ paraphrase_button.click(
+ interface.handle_paraphrase,
+ inputs=user_input,
+ outputs=[
+ highlighted_user_prompt,
+ highlighted_accepted_sentences,
+ highlighted_discarded_sentences,
+ step1_time
+ ]
+ )
+
+ masking_button.click(
+ interface.handle_masking,
+ inputs=None,
+ outputs=tree1_plots + [step2_time]
+ )
+
+ sampling_button.click(
+ interface.handle_sampling,
+ inputs=None,
+ outputs=tree2_plots + [step3_time]
+ )
+
+ reparaphrase_button.click(
+ interface.handle_reparaphrasing,
+ inputs=None,
+ outputs=reparaphrased_sentences_tabs + [step4_time]
+ )
+
+ return demo
+
+if __name__ == "__main__":
+ project_root = Path(__file__).parent.parent
+ config_path = project_root / "utils" / "config.yaml"
+ config = load_config(config_path)['PECCAVI_TEXT']
+
+ create_gradio_interface(config).launch()
\ No newline at end of file
diff --git a/__pycache__/app.cpython-310.pyc b/__pycache__/app.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94f422246f604689537b70b161c46fc265e03b32
Binary files /dev/null and b/__pycache__/app.cpython-310.pyc differ
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..f008ea4cd351a03df9745e357487f55888faaaeb
--- /dev/null
+++ b/app.py
@@ -0,0 +1,21 @@
+
+import gradio as gr
+from UI.gradio import create_gradio_interface
+
+from pathlib import Path
+from utils.config import load_config
+
+project_root = Path(__file__).resolve().parent
+config_path = project_root / "utils" / "config.yaml"
+config = load_config(config_path)['PECCAVI_TEXT']
+
+def main():
+ """
+ This function is the entry point for the PECCAVI Watermarking Model.
+
+ It creates the Gradio interface for the model and runs it.
+ """
+ create_gradio_interface(config).launch()
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7c6c7b53a6b89d6d2e7eb4487fea47cb0d4b1ed6
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,245 @@
+name: panda
+channels:
+ - conda-forge
+ - defaults
+dependencies:
+ - _libgcc_mutex=0.1=conda_forge
+ - _openmp_mutex=4.5=2_gnu
+ - asttokens=2.4.1=pyhd8ed1ab_0
+ - bzip2=1.0.8=h5eee18b_6
+ - ca-certificates=2024.8.30=hbcca054_0
+ - comm=0.2.2=pyhd8ed1ab_0
+ - debugpy=1.8.6=py310hf71b8c6_0
+ - decorator=5.1.1=pyhd8ed1ab_0
+ - exceptiongroup=1.2.2=pyhd8ed1ab_0
+ - executing=2.1.0=pyhd8ed1ab_0
+ - ipykernel=6.29.5=pyh3099207_0
+ - ipython=8.27.0=pyh707e725_0
+ - jedi=0.19.1=pyhd8ed1ab_0
+ - jupyter_client=8.6.3=pyhd8ed1ab_0
+ - jupyter_core=5.7.2=pyh31011fe_1
+ - krb5=1.21.3=h143b758_0
+ - ld_impl_linux-64=2.40=h12ee557_0
+ - libedit=3.1.20230828=h5eee18b_0
+ - libffi=3.4.4=h6a678d5_1
+ - libgcc=14.1.0=h77fa898_1
+ - libgcc-ng=14.1.0=h69a702a_1
+ - libgomp=14.1.0=h77fa898_1
+ - libsodium=1.0.20=h4ab18f5_0
+ - libstdcxx=14.1.0=hc0a3c3a_1
+ - libstdcxx-ng=11.2.0=h1234567_1
+ - libuuid=1.41.5=h5eee18b_0
+ - matplotlib-inline=0.1.7=pyhd8ed1ab_0
+ - ncurses=6.4=h6a678d5_0
+ - nest-asyncio=1.6.0=pyhd8ed1ab_0
+ - openssl=3.3.2=hb9d3cd8_0
+ - packaging=24.1=pyhd8ed1ab_0
+ - parso=0.8.4=pyhd8ed1ab_0
+ - pexpect=4.9.0=pyhd8ed1ab_0
+ - pickleshare=0.7.5=py_1003
+ - pip=24.2=py310h06a4308_0
+ - platformdirs=4.3.6=pyhd8ed1ab_0
+ - prompt-toolkit=3.0.48=pyha770c72_0
+ - ptyprocess=0.7.0=pyhd3deb0d_0
+ - pure_eval=0.2.3=pyhd8ed1ab_0
+ - pygments=2.18.0=pyhd8ed1ab_0
+ - python=3.10.14=h955ad1f_1
+ - python_abi=3.10=2_cp310
+ - pyzmq=26.2.0=py310h71f11fc_2
+ - readline=8.2=h5eee18b_0
+ - setuptools=75.1.0=py310h06a4308_0
+ - sqlite=3.45.3=h5eee18b_0
+ - stack_data=0.6.2=pyhd8ed1ab_0
+ - tk=8.6.14=h39e8969_0
+ - tornado=6.4.1=py310ha75aee5_1
+ - traitlets=5.14.3=pyhd8ed1ab_0
+ - typing_extensions=4.12.2=pyha770c72_0
+ - wcwidth=0.2.13=pyhd8ed1ab_0
+ - wheel=0.44.0=py310h06a4308_0
+ - xz=5.4.6=h5eee18b_1
+ - zeromq=4.3.5=ha4adb4c_5
+ - zlib=1.2.13=h5eee18b_1
+ - pip:
+ - absl-py==2.1.0
+ - accelerate==0.33.0
+ - aiofiles==23.2.1
+ - aiohappyeyeballs==2.3.5
+ - aiohttp==3.10.3
+ - aiosignal==1.3.1
+ - altgraph==0.17.4
+ - annotated-types==0.7.0
+ - anyio==4.6.0
+ - astunparse==1.6.3
+ - async-timeout==4.0.3
+ - attrs==24.2.0
+ - av==12.0.0
+ - backports-tarfile==1.2.0
+ - beautifulsoup4==4.12.3
+ - build==1.2.2
+ - cachetools==5.5.0
+ - certifi==2024.7.4
+ - cffi==1.17.1
+ - charset-normalizer==3.3.2
+ - clean-fid==0.1.35
+ - click==8.1.7
+ - colorama==0.4.6
+ - contextlib2==21.6.0
+ - contourpy==1.2.1
+ - cryptography==43.0.1
+ - cycler==0.12.1
+ - datasets==2.21.0
+ - diffusers==0.27.2
+ - dill==0.3.8
+ - docker-pycreds==0.4.0
+ - docutils==0.21.2
+ - fastapi==0.115.0
+ - ffmpy==0.4.0
+ - filelock==3.15.4
+ - flatbuffers==24.3.25
+ - fonttools==4.53.1
+ - frozenlist==1.4.1
+ - fsspec==2024.6.1
+ - gast==0.4.0
+ - gdown==5.2.0
+ - gitdb==4.0.11
+ - gitpython==3.1.43
+ - google-auth==2.35.0
+ - google-auth-oauthlib==0.4.6
+ - google-pasta==0.2.0
+ - gradio==4.44.0
+ - gradio-client==1.3.0
+ - grpcio==1.65.4
+ - h11==0.14.0
+ - h5py==3.11.0
+ - httpcore==1.0.6
+ - httpx==0.27.2
+ - huggingface-hub==0.25.2
+ - idna==3.7
+ - imageio==2.35.0
+ - importlib-metadata==8.2.0
+ - importlib-resources==6.4.5
+ - jaraco-classes==3.4.0
+ - jaraco-context==6.0.1
+ - jaraco-functools==4.1.0
+ - jeepney==0.8.0
+ - jinja2==3.1.4
+ - joblib==1.4.2
+ - json-with-comments==1.2.7
+ - keras==3.5.0
+ - keras-preprocessing==1.1.2
+ - keyring==25.4.1
+ - kiwisolver==1.4.5
+ - kornia==0.7.4
+ - kornia-rs==0.1.7
+ - lazy-loader==0.4
+ - libclang==18.1.1
+ - markdown==3.6
+ - markdown-it-py==3.0.0
+ - markupsafe==2.1.5
+ - matplotlib==3.9.2
+ - mdurl==0.1.2
+ - ml-collections==0.1.1
+ - ml-dtypes==0.4.0
+ - more-itertools==10.5.0
+ - multidict==6.0.5
+ - multiprocess==0.70.16
+ - namex==0.0.8
+ - networkx==3.3
+ - nh3==0.2.18
+ - nltk==3.9.1
+ - numpy==1.26.4
+ - nvidia-cublas-cu11==11.10.3.66
+ - nvidia-cuda-nvrtc-cu11==11.7.99
+ - nvidia-cuda-runtime-cu11==11.7.99
+ - nvidia-cudnn-cu11==8.5.0.96
+ - oauthlib==3.2.2
+ - opencv-python==4.10.0.84
+ - opencv-python-headless==4.10.0.84
+ - opt-einsum==3.3.0
+ - optree==0.12.1
+ - orjson==3.10.7
+ - pandas==2.2.2
+ - pillow==10.4.0
+ - pkginfo==1.10.0
+ - plotly==5.24.1
+ - protobuf==4.25.5
+ - psutil==5.9.8
+ - pyarrow==17.0.0
+ - pyasn1==0.6.1
+ - pyasn1-modules==0.4.1
+ - pycparser==2.22
+ - pydantic==2.9.2
+ - pydantic-core==2.23.4
+ - pydub==0.25.1
+ - pyinstaller==6.10.0
+ - pyinstaller-hooks-contrib==2024.8
+ - pyparsing==3.1.2
+ - pyproject-hooks==1.1.0
+ - pysocks==1.7.1
+ - python-dateutil==2.9.0.post0
+ - python-multipart==0.0.12
+ - pytorch-msssim==1.0.0
+ - pytorchcv==0.0.73
+ - pytz==2023.3.post1
+ - pyyaml==6.0.2
+ - readme-renderer==44.0
+ - regex==2024.7.24
+ - requests==2.32.3
+ - requests-oauthlib==2.0.0
+ - requests-toolbelt==1.0.0
+ - rfc3986==2.0.0
+ - rich==13.7.1
+ - rsa==4.9
+ - ruff==0.6.9
+ - safetensors==0.4.4
+ - saliency==0.2.1
+ - scikit-image==0.24.0
+ - scikit-learn==1.6.0
+ - scipy==1.14.0
+ - secretstorage==3.3.3
+ - semantic-version==2.10.0
+ - sentence-transformers==3.3.1
+ - sentry-sdk==2.15.0
+ - setproctitle==1.3.3
+ - shapely==2.0.5
+ - shellingham==1.5.4
+ - six==1.12.0
+ - smmap==5.0.1
+ - sniffio==1.3.1
+ - soupsieve==2.6
+ - spaces==0.30.2
+ - starlette==0.38.6
+ - tenacity==9.0.0
+ - tensorboard==2.17.1
+ - tensorboard-data-server==0.7.2
+ - tensorboard-plugin-wit==1.8.1
+ - tensorflow==2.17.0
+ - tensorflow-estimator==2.10.0
+ - tensorflow-hub==0.16.1
+ - tensorflow-intel==0.0.1
+ - tensorflow-io-gcs-filesystem==0.31.0
+ - termcolor==1.1.0
+ - tf-keras==2.17.0
+ - threadpoolctl==3.5.0
+ - tifffile==2024.8.10
+ - timm==1.0.10
+ - tokenizers==0.19.1
+ - tomli==2.0.1
+ - tomlkit==0.12.0
+ - torch==1.13.1
+ - torchvision==0.14.1
+ - tqdm==4.66.5
+ - transformers==4.43.3
+ - twine==5.1.1
+ - typer==0.12.5
+ - tzdata==2024.1
+ - urllib3==2.2.2
+ - uvicorn==0.31.0
+ - wandb==0.18.3
+ - websockets==12.0
+ - werkzeug==3.0.4
+ - wrapt==1.11.2
+ - xxhash==3.4.1
+ - yarl==1.9.4
+ - zipp==3.20.0
+prefix: /home/ashhar21137/miniconda3/envs/panda
diff --git a/metrics/distortion.py b/metrics/distortion.py
new file mode 100644
index 0000000000000000000000000000000000000000..823724c5e0aaa928d655b4861374c87d650b576c
--- /dev/null
+++ b/metrics/distortion.py
@@ -0,0 +1,370 @@
+import os
+import sys
+from tqdm import tqdm
+import numpy as np
+import torch
+import matplotlib.pyplot as plt
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+from bert_score import BERTScorer
+from bert_score.utils import model2layers
+from nltk.tokenize import word_tokenize
+from Levenshtein import distance as levenshtein_distance
+from sentence_transformers import SentenceTransformer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from scipy.spatial.distance import cdist
+from scipy.optimize import linear_sum_assignment
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from config.config import load_config
+config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.yaml')
+config = load_config(config_path)['PECCAVI_TEXT']['Metrics']
+
+class SentenceDistortionCalculator:
+ """
+ A class to calculate and analyze distortion metrics between an original sentence and modified sentences.
+ """
+ def __init__(self, config, original_sentence, paraphrased_sentences):
+ """
+ Initialize the calculator with the original sentence and a list of modified sentences.
+ """
+ self.original_sentence = original_sentence
+ self.paraphrased_sentences = paraphrased_sentences
+
+ self.levenshtein_distances = {}
+ self.bert_scores = {}
+ self.mover_scores = {}
+
+ self.normalized_levenshtein = {}
+ self.normalized_bert_scores = {}
+ self.normalized_mover_scores = {}
+ self.combined_distortions = {}
+
+ self.tokenizer = GPT2TokenizerFast.from_pretrained(config['Distortion'])
+ self.model = GPT2LMHeadModel.from_pretrained(config['Distortion'])
+ self.model.eval()
+
+ def calculate_all_metrics(self):
+ """
+ Calculate all distortion metrics for each modified sentence.
+ """
+ for idx, modified_sentence in tqdm(enumerate(self.paraphrased_sentences), total=len(self.paraphrased_sentences), desc="Calculating Metrics"):
+ key = f"Sentence_{idx+1}"
+ self.levenshtein_distances[key] = self._calculate_levenshtein_distance(modified_sentence)
+ self.bert_scores[key] = self._calculate_bert_score(modified_sentence)
+ self.mover_scores[key] = self._calculate_mover_score(modified_sentence)
+
+
+ def normalize_metrics(self):
+ """
+ Normalize all metrics to be between 0 and 1.
+ """
+ for _ in tqdm(range(1), desc="Normalizing Metrics"): # Add tqdm here (wrap the normalization process)
+ self.normalized_levenshtein = self._normalize_dict(self.levenshtein_distances)
+ self.normalized_bert_scores = self._normalize_dict(self.bert_scores)
+ self.normalized_mover_scores = self._normalize_dict(self.mover_scores)
+
+ def calculate_combined_distortion(self):
+ """
+ Calculate the combined distortion using the root mean square of the normalized metrics.
+ """
+ for _ in tqdm(range(1), desc="Calculating Combined Distortion"): # Add tqdm here
+ for key in self.normalized_levenshtein.keys():
+ rms = np.sqrt(
+ (
+ self.normalized_levenshtein[key] ** 2 +
+ self.normalized_bert_scores[key] ** 2+
+ self.normalized_mover_scores[key] **2
+ ) / 3
+ )
+ self.combined_distortions[key] = rms
+
+ def plot_metrics(self):
+ """
+ Plot each normalized metric and the combined distortion in separate graphs.
+ """
+ keys = list(self.normalized_levenshtein.keys())
+ indices = np.arange(len(keys))
+
+ # Prepare data for plotting
+ metrics = {
+ 'Levenshtein Distance': [self.normalized_levenshtein[key] for key in keys],
+ 'BERTScore': [self.normalized_bert_scores[key] for key in keys],
+            'Mover Score': [self.normalized_mover_scores[key] for key in keys],
+ 'Combined Distortion': [self.combined_distortions[key] for key in keys]
+ }
+
+ # Plot each metric separately
+        for metric_name, values in tqdm(metrics.items(), desc="Plotting Metrics"):
+ plt.figure(figsize=(12, 6))
+ plt.plot(indices, values, marker='o', color=np.random.rand(3,))
+ plt.xlabel('Sentence Index')
+ plt.ylabel('Normalized Value (0-1)')
+ plt.title(f'Normalized {metric_name}')
+ plt.grid(True)
+ plt.tight_layout()
+ plt.show()
+
+ def _calculate_levenshtein_distance(self, modified_sentence):
+ """
+ Calculate the word-level Levenshtein distance between the original and modified sentence.
+ """
+ words1 = word_tokenize(self.original_sentence)
+ words2 = word_tokenize(modified_sentence)
+ lev_distance = levenshtein_distance(words1, words2)
+ return (lev_distance / max(len(words1), len(words2)))
+
+ def _calculate_bert_score(self, modified_sentence):
+ """
+ Compute the BERTScore similarity between the original and modified sentence.
+ Returns 1 - F1 score to represent dissimilarity.
+ """
+ if not hasattr(self, 'original_sentence'):
+ raise ValueError("original_sentence is not set. Please set self.original_sentence before calling this function.")
+ if not isinstance(modified_sentence, str):
+ raise ValueError("modified_sentence must be a string.")
+
+ model_type = "microsoft/deberta-xlarge-mnli"
+ num_layers = model2layers[model_type]
+
+ if not hasattr(self, "cached_bertscorer"):
+ self.cached_bertscorer = BERTScorer(
+ model_type=model_type,
+ num_layers=num_layers,
+ batch_size=1, # Single sentence comparison
+ nthreads=4,
+ all_layers=False,
+ idf=False,
+ device="cuda" if torch.cuda.is_available() else "cpu",
+ lang="en"
+ )
+
+ # Compute BERTScore
+ _, _, F1 = self.cached_bertscorer.score(
+ cands=[modified_sentence],
+ refs=[self.original_sentence],
+ verbose=False,
+ batch_size=1
+ )
+
+        return 1 - F1.item()  # Return dissimilarity score
+
+    def _calculate_mover_score(self, modified_sentence, model_name='all-MiniLM-L6-v2'):
+ """Compute MoverScore correctly using word-level embeddings."""
+ if not self.original_sentence:
+ raise ValueError("Original sentence not provided.")
+
+ # Tokenize sentences
+ original_tokens = self.original_sentence.split()
+ modified_tokens = modified_sentence.split()
+ model = SentenceTransformer(model_name)
+
+ # Compute word embeddings
+ original_embeddings = model.encode(original_tokens, convert_to_numpy=True)
+ modified_embeddings = model.encode(modified_tokens, convert_to_numpy=True)
+
+ # Compute cost matrix (cosine distance)
+ cost_matrix = cdist(original_embeddings, modified_embeddings, metric='cosine')
+
+ # Solve optimal transport problem (Hungarian Algorithm)
+ row_ind, col_ind = linear_sum_assignment(cost_matrix)
+
+ # Compute IDF weights
+ vectorizer = TfidfVectorizer()
+ vectorizer.fit([self.original_sentence, modified_sentence])
+ idf_values = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))
+
+ # Apply IDF weighting to aligned word pairs
+ idf_weights_original = np.array([idf_values.get(word.lower(), 1.0) for word in original_tokens])
+ idf_weights_modified = np.array([idf_values.get(word.lower(), 1.0) for word in modified_tokens])
+ combined_idf_weights = (idf_weights_original[row_ind] + idf_weights_modified[col_ind]) / 2
+ weighted_score = np.sum((1 - cost_matrix[row_ind, col_ind]) * combined_idf_weights) / np.sum(combined_idf_weights)
+
+        return 1 - weighted_score  # Higher score = more dissimilar
+
+ def _normalize_dict(self, metric_dict):
+ """
+ Normalize the values in a dictionary to be between 0 and 1.
+ """
+ values = np.array(list(metric_dict.values()))
+ min_val = values.min()
+ max_val = values.max()
+ if max_val - min_val == 0:
+ normalized_values = np.zeros_like(values)
+ else:
+ normalized_values = (values - min_val) / (max_val - min_val)
+ return dict(zip(metric_dict.keys(), normalized_values))
+
+ def get_normalized_metrics(self):
+ """
+ Get all normalized metrics as a dictionary.
+ """
+ return {
+ 'Min Edit Distance': self.normalized_levenshtein,
+ 'BERTScore': self.normalized_bert_scores,
+ 'Mover Score': self.normalized_mover_scores
+ }
+
+ def get_combined_distortions(self):
+ """
+ Get the dictionary of combined distortion values.
+ """
+ return self.combined_distortions
+
+# Example usage
+if __name__ == "__main__":
+
+ config = load_config(config_path)['PECCAVI_TEXT']['Metrics']
+
+ # Original sentence
+ original_sentence = "The quick brown fox jumps over the lazy dog"
+
+ # Paraphrased sentences
+ paraphrased_sentences = [
+ # Original 1: "A swift auburn fox leaps across a sleepy canine."
+ "The swift auburn fox leaps across a sleepy canine.",
+ "A quick auburn fox leaps across a sleepy canine.",
+ "A swift ginger fox leaps across a sleepy canine.",
+ "A swift auburn fox bounds across a sleepy canine.",
+ "A swift auburn fox leaps across a tired canine.",
+ "Three swift auburn foxes leap across a sleepy canine.",
+ "The vulpine specimen rapidly traverses over a dormant dog.",
+ "Like lightning, the russet hunter soars over the drowsy guardian.",
+ "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
+ "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
+ "A swift auburn predator navigates across a lethargic pet.",
+ "Subject A (fox) demonstrates velocity over Subject B (dog).",
+
+ # Original 2: "The agile russet fox bounds over an idle hound."
+ "Some agile russet foxes bound over an idle hound.",
+ "The nimble russet fox bounds over an idle hound.",
+ "The agile brown fox bounds over an idle hound.",
+ "The agile russet fox jumps over an idle hound.",
+ "The agile russet fox bounds over a lazy hound.",
+ "Two agile russet foxes bound over an idle hound.",
+ "A dexterous vulpine surpasses a stationary canine.",
+ "Quick as thought, the copper warrior sails over the guardian.",
+ "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
+ "A dexterous V. vulpes exceeds the plane of an inactive canine.",
+ "An agile russet hunter maneuvers above a resting hound.",
+ "Test subject F-1 achieves displacement superior to subject D-1.",
+
+ # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
+ "The nimble mahogany vulpine vaults above a drowsy dog.",
+ "A swift mahogany vulpine vaults above a drowsy dog.",
+ "A nimble reddish vulpine vaults above a drowsy dog.",
+ "A nimble mahogany fox vaults above a drowsy dog.",
+ "A nimble mahogany vulpine leaps above a drowsy dog.",
+ "Four nimble mahogany vulpines vault above a drowsy dog.",
+ "An agile specimen of reddish fur surpasses a somnolent canine.",
+ "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
+ "Tha quick brown beastie jumps o'er the tired pup, aye.",
+ "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
+ "A nimble rust-colored predator crosses above a drowsy pet.",
+ "Observed: Subject Red executes vertical motion over Subject Gray.",
+
+ # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
+ "A speedy copper-colored fox hops over the lethargic pup.",
+ "The quick copper-colored fox hops over the lethargic pup.",
+ "The speedy bronze fox hops over the lethargic pup.",
+ "The speedy copper-colored fox jumps over the lethargic pup.",
+ "The speedy copper-colored fox hops over the tired pup.",
+ "Multiple speedy copper-colored foxes hop over the lethargic pup.",
+ "A rapid vulpine of bronze hue traverses an inactive young canine.",
+ "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
+ "Tha fast copper beastie leaps o'er the sleepy wee dog.",
+ "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
+ "A fleet copper-toned predator moves past a sluggish young dog.",
+ "Field note: Adult fox subject exceeds puppy subject vertically.",
+
+ # Original 5: "A rapid tawny fox springs over a sluggish dog."
+ "The rapid tawny fox springs over a sluggish dog.",
+ "A quick tawny fox springs over a sluggish dog.",
+ "A rapid golden fox springs over a sluggish dog.",
+ "A rapid tawny fox jumps over a sluggish dog.",
+ "A rapid tawny fox springs over a lazy dog.",
+ "Six rapid tawny foxes spring over a sluggish dog.",
+ "An expeditious yellowish vulpine surpasses a torpid canine.",
+ "Fast as a bullet, the golden hunter vaults over the idle guard.",
+ "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
+ "One V. vulpes displays rapid transit over one inactive C. familiaris.",
+ "A speedy yellow-brown predator bypasses a motionless dog.",
+ "Log entry: Vulpine subject achieves swift vertical displacement.",
+
+ # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
+ "A fleet-footed chestnut fox soars above an indolent canine.",
+ "The swift chestnut fox soars above an indolent canine.",
+ "The fleet-footed brown fox soars above an indolent canine.",
+ "The fleet-footed chestnut fox leaps above an indolent canine.",
+ "The fleet-footed chestnut fox soars above a lazy canine.",
+ "Several fleet-footed chestnut foxes soar above an indolent canine.",
+ "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
+ "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
+ "Tha quick brown beastie sails o'er the sleepy hound, ken.",
+ "Single agile V. vulpes achieves elevation above stationary canine.",
+ "A nimble brown predator glides over an unmoving domestic animal.",
+ "Research note: Brown subject displays superior vertical mobility.",
+
+ # Original 7: "A fast ginger fox hurdles past a slothful dog."
+ "The fast ginger fox hurdles past a slothful dog.",
+ "A quick ginger fox hurdles past a slothful dog.",
+ "A fast red fox hurdles past a slothful dog.",
+ "A fast ginger fox jumps past a slothful dog.",
+ "A fast ginger fox hurdles past a lazy dog.",
+ "Five fast ginger foxes hurdle past a slothful dog.",
+ "A rapid orange vulpine bypasses a lethargic canine.",
+ "Quick as lightning, the flame-colored hunter races past the lazy guard.",
+ "Tha swift ginger beastie leaps past the tired doggy, ye see.",
+ "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
+ "A speedy red-orange predator overtakes a motionless dog.",
+ "Data point: Orange subject demonstrates rapid transit past Gray subject.",
+
+ # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
+ "A spry rusty-colored fox jumps across a dozing hound.",
+ "The agile rusty-colored fox jumps across a dozing hound.",
+ "The spry reddish fox jumps across a dozing hound.",
+ "The spry rusty-colored fox leaps across a dozing hound.",
+ "The spry rusty-colored fox jumps across a sleeping hound.",
+ "Multiple spry rusty-colored foxes jump across a dozing hound.",
+ "An agile rust-toned vulpine traverses a somnolent canine.",
+ "Nimble as thought, the copper hunter bounds over the resting guard.",
+ "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
+ "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
+ "A lithe rust-tinted predator moves past a slumbering dog.",
+ "Observation: Russet subject exhibits agility over dormant subject.",
+
+ # Original 9: "A quick tan fox leaps over an inactive dog."
+ "The quick tan fox leaps over an inactive dog.",
+ "A swift tan fox leaps over an inactive dog.",
+ "A quick beige fox leaps over an inactive dog.",
+ "A quick tan fox jumps over an inactive dog.",
+ "A quick tan fox leaps over a motionless dog.",
+ "Seven quick tan foxes leap over an inactive dog.",
+ "A rapid light-brown vulpine surpasses a stationary canine.",
+ "Fast as wind, the sand-colored hunter soars over the still guard.",
+ "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
+ "One agile fawn V. vulpes traverses one immobile C. familiaris.",
+ "A fleet tan-colored predator bypasses an unmoving dog.",
+ "Field report: Tan subject demonstrates movement over static subject.",
+
+ # Original 10: "The brisk auburn vulpine bounces over a listless canine."
+ "Some brisk auburn vulpines bounce over a listless canine.",
+ "The quick auburn vulpine bounces over a listless canine.",
+ "The brisk russet vulpine bounces over a listless canine.",
+ "The brisk auburn fox bounces over a listless canine.",
+ "The brisk auburn vulpine jumps over a listless canine.",
+ "Five brisk auburn vulpines bounce over a listless canine.",
+ "The expeditious specimen supersedes a quiescent Canis lupus.",
+ "Swift as wind, the russet hunter vaults over the idle guardian.",
+ "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
+ "One V. vulpes achieves displacement over inactive C. familiaris.",
+ "A high-velocity auburn predator traverses an immobile animal.",
+ "Final observation: Red subject shows mobility over Gray subject."
+ ]
+
+ distortion_calculator = SentenceDistortionCalculator(config, original_sentence, paraphrased_sentences)
+    distortion_calculator.calculate_all_metrics()
+    distortion_calculator.normalize_metrics()
+    distortion_calculator.calculate_combined_distortion()
+ distortion_calculator.plot_metrics()
+ print("Normalized Metrics:", distortion_calculator.get_normalized_metrics())
+ print("Combined Distortion:", distortion_calculator.get_combined_distortions())
\ No newline at end of file
diff --git a/renderers/__pycache__/highlighter.cpython-310.pyc b/renderers/__pycache__/highlighter.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..510ec095660081fa4720670c142da6842e77f377
Binary files /dev/null and b/renderers/__pycache__/highlighter.cpython-310.pyc differ
diff --git a/renderers/__pycache__/highlighter.cpython-311.pyc b/renderers/__pycache__/highlighter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ab39a71b59353506700760768ad5c3f09fe3f8b
Binary files /dev/null and b/renderers/__pycache__/highlighter.cpython-311.pyc differ
diff --git a/renderers/__pycache__/plot_3d.cpython-310.pyc b/renderers/__pycache__/plot_3d.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e793dc94a8945951cce597a5439ad5512dec6bb9
Binary files /dev/null and b/renderers/__pycache__/plot_3d.cpython-310.pyc differ
diff --git a/renderers/__pycache__/plot_3d.cpython-311.pyc b/renderers/__pycache__/plot_3d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6dae5f8085f8aefeeaf78f7d81ce31f8f607ed9d
Binary files /dev/null and b/renderers/__pycache__/plot_3d.cpython-311.pyc differ
diff --git a/renderers/__pycache__/tree.cpython-310.pyc b/renderers/__pycache__/tree.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..233737f1a6b7ee72033417aafb41980c35f7cbde
Binary files /dev/null and b/renderers/__pycache__/tree.cpython-310.pyc differ
diff --git a/renderers/__pycache__/tree.cpython-311.pyc b/renderers/__pycache__/tree.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fc03ee206c2a9ff0fc0efdc5d64287e521a2e3a
Binary files /dev/null and b/renderers/__pycache__/tree.cpython-311.pyc differ
diff --git a/renderers/highlighter.py b/renderers/highlighter.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f09fe17ea7cd4d5f0261e4dc416bd9f7b66ae8b
--- /dev/null
+++ b/renderers/highlighter.py
@@ -0,0 +1,162 @@
+import re
+
+def highlight_common_words(common_words, sentences, title):
+ """
+ Highlight common words in sentences by adding color-coded background and unique IDs.
+
+ Args:
+ common_words (list of tuples): List of tuples where each tuple contains a word's index and the word.
+ sentences (list of str): List of sentences to search through.
+ title (str): The title for the HTML output.
+
+ Returns:
+ str: HTML string with the highlighted sentences.
+ """
+ color_map = {}
+ color_index = 0
+ highlighted_html = []
+
+ # Process each sentence
+ for idx, sentence in enumerate(sentences, start=1):
+ sentence_with_idx = f"{idx}. {sentence}"
+ highlighted_sentence = sentence_with_idx
+
+ # Highlight common words in each sentence
+ for index, word in common_words:
+ if word not in color_map:
+ color_map[word] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
+ color_index += 1
+
+ # Escape word and create regex pattern to match whole word
+ escaped_word = re.escape(word)
+ pattern = rf'\b{escaped_word}\b'
+
+ # Replace the word with highlighted version
+ highlighted_sentence = re.sub(
+ pattern,
+                lambda m, idx=index, color=color_map[word]: (
+                    f'<span style="background-color: {color};">'
+                    f'<sup>{idx}</sup>'
+                    f'{m.group(0)}'
+                    f'</span>'
+ ),
+ highlighted_sentence,
+ flags=re.IGNORECASE
+ )
+
+ highlighted_html.append(highlighted_sentence)
+
+ # Format the HTML output with the title
+ final_html = "
".join(highlighted_html)
+ return f'''
+
+
{title}
+
{final_html}
+
+ '''
+
+def highlight_common_words_dict(common_words, sentences, title):
+ """
+ Highlight common words in sentences (from a dictionary) by adding color-coded background and unique IDs.
+
+ Args:
+ common_words (list of tuples): List of tuples where each tuple contains a word's index and the word.
+ sentences (dict): A dictionary of sentences where the key is the sentence and the value is an entailment score.
+ title (str): The title for the HTML output.
+
+ Returns:
+ str: HTML string with the highlighted sentences and their entailment scores.
+ """
+ color_map = {}
+ color_index = 0
+ highlighted_html = []
+
+ # Process each sentence and its score
+ for idx, (sentence, score) in enumerate(sentences.items(), start=1):
+ sentence_with_idx = f"{idx}. {sentence}"
+ highlighted_sentence = sentence_with_idx
+
+ # Highlight common words in each sentence
+ for index, word in common_words:
+ if word not in color_map:
+ color_map[word] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
+ color_index += 1
+
+ # Escape word and create regex pattern to match whole word
+ escaped_word = re.escape(word)
+ pattern = rf'\b{escaped_word}\b'
+
+ # Replace the word with highlighted version
+ highlighted_sentence = re.sub(
+ pattern,
+                lambda m, idx=index, color=color_map[word]: (
+                    f'<span style="background-color: {color};">'
+                    f'<sup>{idx}</sup>'
+                    f'{m.group(0)}'
+                    f'</span>'
+ ),
+ highlighted_sentence,
+ flags=re.IGNORECASE
+ )
+
+ # Add the entailment score
+        highlighted_html.append(
+            f'<div>'
+            f'{highlighted_sentence}'
+            f'<br>'
+            f'Entailment Score: {score}</div>'
+        )
+
+ # Format the HTML output with the title
+ final_html = "
".join(highlighted_html)
+ return f'''
+
+
{title}
+
{final_html}
+
+ '''
+
+def reparaphrased_sentences_html(sentences):
+ """
+ Create an HTML representation of sentences with numbering.
+
+ Args:
+ sentences (list of str): List of sentences to format.
+
+ Returns:
+ str: HTML string with numbered sentences.
+ """
+ formatted_sentences = []
+
+ # Process each sentence
+ for idx, sentence in enumerate(sentences, start=1):
+ sentence_with_idx = f"{idx}. {sentence}"
+ formatted_sentences.append(sentence_with_idx)
+
+ # Format the HTML output
+ final_html = "
".join(formatted_sentences)
+ return f'''
+
+ '''
+
+if __name__ == "__main__":
+ # Example usage
+ common_words = [(1, "highlight"), (2, "numbering")]
+ sentences = ["This is a test to highlight words.", "Numbering is important for clarity."]
+
+ # Test highlight_common_words
+ highlighted_html = highlight_common_words(common_words, sentences, "Test Highlighting")
+ print(highlighted_html)
+
+ # Test highlight_common_words_dict
+ sentences_with_scores = {"Highlight words in this text.": 0.95, "Number sentences for clarity.": 0.8}
+ highlighted_html_dict = highlight_common_words_dict(common_words, sentences_with_scores, "Test Dict Highlighting")
+ print(highlighted_html_dict)
diff --git a/renderers/plot_3d.py b/renderers/plot_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..3355c77b307fd95129c37ff5e5ae5d3de15751c1
--- /dev/null
+++ b/renderers/plot_3d.py
@@ -0,0 +1,126 @@
+"""
+This file contains the code to plot a 3d tree
+"""
+import numpy as np
+import plotly.graph_objects as go
+from scipy.interpolate import griddata
+
+def gen_three_D_plot(detectability_val, distortion_val, euclidean_val):
+ """
+ Generates a 3D surface plot showing the relationship between detectability, distortion,
+ and Euclidean distance, with a focus on highlighting the "sweet spot" based on a composite score.
+
+ The function takes three sets of values: detectability, distortion, and Euclidean distance,
+ normalizes them to a [0, 1] range, and computes a composite score that combines these three metrics.
+ The "sweet spot" is the point where the composite score is maximized. This sweet spot is plotted
+ as a red marker on the 3D surface plot.
+
+ The function then uses a grid interpolation method (`griddata`) to generate a smooth surface
+ for the Euclidean distance over the detectability and distortion values. The result is a surface plot
+ where the contours represent different Euclidean distances.
+
+ Args:
+ detectability_val (list or array): A list or array of detectability scores.
+ distortion_val (list or array): A list or array of distortion scores.
+ euclidean_val (list or array): A list or array of Euclidean distances.
+
+ Returns:
+ plotly.graph_objects.Figure: A Plotly figure object representing the 3D surface plot,
+ with contour lines and a marker for the sweet spot.
+
+ Raises:
+ ValueError: If `griddata` fails to generate a valid interpolation, which could happen if the
+ input data does not allow for a proper interpolation.
+
+ Example:
+ # Example of usage:
+ detectability_vals = [0.1, 0.3, 0.5, 0.7, 0.9]
+ distortion_vals = [0.2, 0.4, 0.6, 0.8, 1.0]
+ euclidean_vals = [0.5, 0.3, 0.2, 0.4, 0.6]
+
+ fig = gen_three_D_plot(detectability_vals, distortion_vals, euclidean_vals)
+ fig.show() # Displays the plot in a web browser
+
+ Notes:
+ - The composite score is calculated as:
+ `composite_score = norm_detectability - (norm_distortion + norm_euclidean)`,
+ where the goal is to maximize detectability and minimize distortion and Euclidean distance.
+        - The `griddata` function uses nearest-neighbour interpolation to fill the surface grid.
+ - The function uses the "Plasma" colorscale for the surface plot, which provides a perceptually uniform color scheme.
+ """
+
+ detectability = np.array(detectability_val)
+ distortion = np.array(distortion_val)
+ euclidean = np.array(euclidean_val)
+
+ # Normalize the values to range [0, 1]
+ norm_detectability = (detectability - min(detectability)) / (max(detectability) - min(detectability))
+ norm_distortion = (distortion - min(distortion)) / (max(distortion) - min(distortion))
+ norm_euclidean = (euclidean - min(euclidean)) / (max(euclidean) - min(euclidean))
+
+ # Composite score: maximize detectability, minimize distortion and Euclidean distance
+ composite_score = norm_detectability - (norm_distortion + norm_euclidean)
+
+ # Find the index of the maximum score (sweet spot)
+ sweet_spot_index = np.argmax(composite_score)
+
+ # Sweet spot values
+ sweet_spot_detectability = detectability[sweet_spot_index]
+ sweet_spot_distortion = distortion[sweet_spot_index]
+ sweet_spot_euclidean = euclidean[sweet_spot_index]
+
+ # Create a meshgrid from the data
+ x_grid, y_grid = np.meshgrid(np.linspace(min(detectability), max(detectability), 30),
+ np.linspace(min(distortion), max(distortion), 30))
+
+ # Interpolate z values (Euclidean distances) to fit the grid using 'nearest' method
+ z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='nearest')
+
+ if z_grid is None:
+ raise ValueError("griddata could not generate a valid interpolation. Check your input data.")
+
+ # Create the 3D contour plot with the Plasma color scale
+ fig = go.Figure(data=go.Surface(
+ z=z_grid,
+ x=x_grid,
+ y=y_grid,
+ contours={
+ "z": {"show": True, "start": min(euclidean), "end": max(euclidean), "size": 0.1, "usecolormap": True}
+ },
+ colorscale='Plasma'
+ ))
+
+ # Add a marker for the sweet spot
+ fig.add_trace(go.Scatter3d(
+ x=[sweet_spot_detectability],
+ y=[sweet_spot_distortion],
+ z=[sweet_spot_euclidean],
+ mode='markers+text',
+ marker=dict(size=10, color='red', symbol='circle'),
+ text=["Sweet Spot"],
+ textposition="top center"
+ ))
+
+ # Set axis labels
+ fig.update_layout(
+ scene=dict(
+ xaxis_title='Detectability Score',
+ yaxis_title='Distortion Score',
+ zaxis_title='Euclidean Distance'
+ ),
+ margin=dict(l=0, r=0, b=0, t=0)
+ )
+
+ return fig
+
+if __name__ == "__main__":
+ # Example input data
+ detectability_vals = [0.1, 0.3, 0.5, 0.7, 0.9]
+ distortion_vals = [0.2, 0.4, 0.6, 0.8, 1.0]
+ euclidean_vals = [0.5, 0.3, 0.2, 0.4, 0.6]
+
+ # Call the function with example data
+ fig = gen_three_D_plot(detectability_vals, distortion_vals, euclidean_vals)
+
+ # Show the plot
+ fig.show()
\ No newline at end of file
diff --git a/renderers/tree.py b/renderers/tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ec281b0238bb808ebb84e920c8191022daa194e
--- /dev/null
+++ b/renderers/tree.py
@@ -0,0 +1,490 @@
+import plotly.graph_objects as go
+import textwrap
+import re
+from collections import defaultdict
+
+def generate_subplot1(paraphrased_sentence, masked_sentences, strategies, highlight_info, common_grams):
+ """
+ Generates a subplot visualizing paraphrased and masked sentences in a tree structure.
+ Highlights common words with specific colors and applies Longest Common Subsequence (LCS) numbering.
+
+ Args:
+ paraphrased_sentence (str): The paraphrased sentence to be visualized.
+ masked_sentences (list of str): A list of masked sentences to be visualized.
+ strategies (list of str, optional): List of strategies used for each masked sentence.
+ highlight_info (list of tuples): A list of tuples where each tuple contains a word and its associated color for highlighting.
+ common_grams (list of tuples): A list of tuples containing an index and a common word or phrase for LCS numbering.
+
+ Returns:
+ plotly.graph_objects.Figure: A Plotly figure representing the tree structure with highlighted words and labeled edges.
+ """
+ # Combine nodes into one list with appropriate labels
+ if isinstance(masked_sentences, str):
+ masked_sentences = [masked_sentences]
+ nodes = [paraphrased_sentence] + masked_sentences
+ nodes[0] += ' L0' # Paraphrased sentence is level 0
+ if len(nodes) < 2:
+ print("[ERROR] Insufficient nodes for visualization")
+ return go.Figure()
+
+ for i in range(1, len(nodes)):
+ nodes[i] += ' L1' # masked sentences are level 1
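+    # The ' L0'/' L1' suffixes are temporary level tags: get_levels_and_edges() parses
+    # them to build the tree, and cleaned_nodes strips them again before rendering.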
+
+ def apply_lcs_numbering(sentence, common_grams):
+ """
+ Applies LCS numbering to the sentence based on the common_grams.
+
+ Args:
+ sentence (str): The sentence to which the LCS numbering should be applied.
+ common_grams (list of tuples): A list of common grams to be replaced with LCS numbers.
+
+ Returns:
+ str: The sentence with LCS numbering applied.
+ """
+ for idx, lcs in common_grams:
+ sentence = re.sub(rf"\b{lcs}\b", f"({idx}){lcs}", sentence)
+ return sentence
+
+ # Apply LCS numbering
+ nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]
+
+
+ def highlight_words(sentence, color_map):
+ """
+ Highlights words in the sentence based on the color_map.
+
+ Args:
+ sentence (str): The sentence where the words will be highlighted.
+ color_map (dict): A dictionary mapping words to their colors.
+
+ Returns:
+ str: The sentence with highlighted words.
+ """
+ for word, color in color_map.items():
+ sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
+ return sentence
+
+ # Clean and wrap nodes, and highlight specified words globally
+ cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
+ global_color_map = dict(highlight_info)
+ highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
+    wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=55)) for node in highlighted_nodes]
+
+ def get_levels_and_edges(nodes, strategies=None):
+ """
+ Determines tree levels and creates edges dynamically.
+
+ Args:
+ nodes (list of str): The nodes representing the sentences.
+ strategies (list of str, optional): The strategies used for each edge.
+
+ Returns:
+ tuple: A tuple containing two dictionaries:
+ - levels: A dictionary mapping node indices to their levels.
+ - edges: A list of edges where each edge is represented by a tuple of node indices.
+ """
+ levels = {}
+ edges = []
+ for i, node in enumerate(nodes):
+ level = int(node.split()[-1][1])
+ levels[i] = level
+
+ # Add edges from L0 to all L1 nodes
+ root_node = next((i for i, level in levels.items() if level == 0), 0)
+ for i, level in levels.items():
+ if level == 1:
+ edges.append((root_node, i))
+
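+        # Example (hypothetical): one paraphrased root with three masked variants gives
+        # levels == {0: 0, 1: 1, 2: 1, 3: 1} and edges == [(0, 1), (0, 2), (0, 3)].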
+ return levels, edges
+
+ # Get levels and dynamic edges
+ levels, edges = get_levels_and_edges(nodes, strategies)
+ max_level = max(levels.values(), default=0)
+
+ # Calculate positions
+ positions = {}
+ level_heights = defaultdict(int)
+ for node, level in levels.items():
+ level_heights[level] += 1
+
+ y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
+ x_gap = 2
+ l1_y_gap = 10
+
+    for node, level in levels.items():
+        positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
+        y_offsets[level] += 1
+
+ def color_highlighted_words(node, color_map):
+ """
+ Colors the highlighted words in the node text.
+
+ Args:
+ node (str): The node text to be highlighted.
+ color_map (dict): A dictionary mapping words to their colors.
+
+ Returns:
+ str: The node text with highlighted words.
+ """
+ parts = re.split(r'(\{\{.*?\}\})', node)
+ colored_parts = []
+ for part in parts:
+ match = re.match(r'\{\{(.*?)\}\}', part)
+ if match:
+ word = match.group(1)
+ color = color_map.get(word, 'black')
+                # Wrap the word in a colored span so Plotly renders the highlight color.
+                colored_parts.append(f"<span style='color: {color}'>{word}</span>")
+ else:
+ colored_parts.append(part)
+ return ''.join(colored_parts)
+
+ # Define the text for each edge
+ default_edge_texts = [
+ "Highest Entropy Masking", "Pseudo-random Masking", "Random Masking",
+ "Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling",
+ "Inverse Transform Sampling", "Greedy Sampling", "Temperature Sampling",
+ "Exponential Minimum Sampling", "Inverse Transform Sampling", "Greedy Sampling",
+ "Temperature Sampling", "Exponential Minimum Sampling", "Inverse Transform Sampling"
+ ]
+
+ if len(nodes) < 2:
+ print("[ERROR] Insufficient nodes for visualization")
+ return go.Figure()
+
+ # Create figure
+ fig1 = go.Figure()
+
+ # Add nodes to the figure
+ for i, node in enumerate(wrapped_nodes):
+ colored_node = color_highlighted_words(node, global_color_map)
+ x, y = positions[i]
+ fig1.add_trace(go.Scatter(
+ x=[-x], # Reflect the x coordinate
+ y=[y],
+ mode='markers',
+ marker=dict(size=20, color='blue', line=dict(color='black', width=2)),
+ hoverinfo='none'
+ ))
+ fig1.add_annotation(
+ x=-x, # Reflect the x coordinate
+ y=y,
+ text=colored_node,
+ showarrow=False,
+ xshift=15,
+ align="center",
+ font=dict(size=12),
+ bordercolor='black',
+ borderwidth=2,
+ borderpad=4,
+ bgcolor='white',
+ width=400,
+ height=100
+ )
+
+ # Add edges and text above each edge
+ for i, edge in enumerate(edges):
+ x0, y0 = positions[edge[0]]
+ x1, y1 = positions[edge[1]]
+
+ # Use strategy if available, otherwise use default edge text
+ if strategies and i < len(strategies):
+ edge_text = strategies[i]
+ else:
+ edge_text = default_edge_texts[i % len(default_edge_texts)]
+
+ fig1.add_trace(go.Scatter(
+ x=[-x0, -x1], # Reflect the x coordinates
+ y=[y0, y1],
+ mode='lines',
+ line=dict(color='black', width=1)
+ ))
+
+ # Calculate the midpoint of the edge
+ mid_x = (-x0 + -x1) / 2
+ mid_y = (y0 + y1) / 2
+
+ # Adjust y position to shift text upwards
+ text_y_position = mid_y + 0.8 # Increase this value to shift the text further upwards
+
+ # Add text annotation above the edge
+ fig1.add_annotation(
+ x=mid_x,
+ y=text_y_position,
+ text=edge_text, # Use the text specific to this edge
+ showarrow=False,
+ font=dict(size=12),
+ align="center"
+ )
+
+ fig1.update_layout(
+ showlegend=False,
+ margin=dict(t=50, b=50, l=50, r=50),
+ xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+ yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+ width=800 + max_level * 200, # Adjusted width to accommodate more levels
+ height=300 + len(nodes) * 100, # Adjusted height to accommodate more levels
+ plot_bgcolor='rgba(240,240,240,0.2)',
+ paper_bgcolor='white'
+ )
+
+ return fig1
+
+def generate_subplot2(masked_sentences, sampled_sentences, highlight_info, common_grams):
+ """
+ Generates a subplot visualizing multiple masked sentences and their sampled variants in a tree structure.
+ Each masked sentence will have multiple sampled sentences derived from it using different sampling techniques.
+
+ Args:
+ masked_sentences (list of str): A list of masked sentences to be visualized as root nodes.
+ sampled_sentences (list of str): A list of sampled sentences derived from masked sentences.
+ highlight_info (list of tuples): A list of tuples where each tuple contains a word and its associated color for highlighting.
+ common_grams (list of tuples): A list of tuples containing an index and a common word or phrase for LCS numbering.
+
+ Returns:
+ plotly.graph_objects.Figure: A Plotly figure representing the tree structure with highlighted words and labeled edges.
+ """
+ # Define sampling techniques
+ sampling_techniques = [
+ "Greedy Sampling",
+ "Temperature Sampling",
+ "Exponential Minimum Sampling",
+ "Inverse Transform Sampling"
+ ]
+
+ # Calculate total number of nodes
+ num_masked = len(masked_sentences)
+ num_sampled_per_masked = len(sampling_techniques)
+ total_nodes = num_masked + (num_masked * num_sampled_per_masked)
+
+ # Combine all sentences into nodes list with appropriate labels
+ nodes = []
+ # Level 0: masked sentences (root nodes)
+ nodes.extend([s + ' L0' for s in masked_sentences])
+
+ # Level 1: sampled sentences (branch nodes)
+ # For each masked sentence, we should have samples from each technique
+ sampled_nodes = []
+
+ # Validate if we have the expected number of sampled sentences
+ expected_sampled_count = num_masked * num_sampled_per_masked
+ if len(sampled_sentences) < expected_sampled_count:
+ # If insufficient samples provided, pad with placeholder sentences
+ print(f"Warning: Expected {expected_sampled_count} sampled sentences, but got {len(sampled_sentences)}")
+ while len(sampled_sentences) < expected_sampled_count:
+ sampled_sentences.append(f"Placeholder sampled sentence {len(sampled_sentences) + 1}")
+
+ # Add all sampled sentences with level information
+ for s in sampled_sentences[:expected_sampled_count]:
+ sampled_nodes.append(s + ' L1')
+
+ nodes.extend(sampled_nodes)
+
+ def apply_lcs_numbering(sentence, common_grams):
+ """
+ Applies LCS numbering to the sentence based on the common_grams.
+ """
+ for idx, lcs in common_grams:
+            sentence = re.sub(rf"\b{re.escape(lcs)}\b", f"({idx}){lcs}", sentence)
+ return sentence
+
+ # Apply LCS numbering
+ nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]
+
+ def highlight_words(sentence, color_map):
+ """
+ Highlights words in the sentence based on the color_map.
+ """
+ for word, color in color_map.items():
+            sentence = re.sub(rf"\b{re.escape(word)}\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
+ return sentence
+
+ # Helper function to color highlighted words
+ def color_highlighted_words(node, color_map):
+ """
+ Colors the highlighted words in the node text.
+ """
+ parts = re.split(r'(\{\{.*?\}\})', node)
+ colored_parts = []
+ for part in parts:
+ match = re.match(r'\{\{(.*?)\}\}', part)
+ if match:
+ word = match.group(1)
+ color = color_map.get(word, 'black')
+                # Wrap the word in a colored span so Plotly renders the highlight color.
+                colored_parts.append(f"<span style='color: {color}'>{word}</span>")
+ else:
+ colored_parts.append(part)
+ return ''.join(colored_parts)
+
+ # Clean nodes, highlight words, and wrap text
+ cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
+ global_color_map = dict(highlight_info)
+ highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
+    wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=80)) for node in highlighted_nodes]
+
+ # Generate edges based on the tree structure
+ def get_levels_and_edges(nodes):
+ levels = {}
+ edges = []
+
+ # Extract level info from node labels
+ for i, node in enumerate(nodes):
+ level = int(node.split()[-1][1])
+ levels[i] = level
+
+ # Create edges from masked sentences to their sampled variants
+ for masked_idx in range(num_masked):
+ # For each masked sentence, create edges to its sampled variants
+ for technique_idx in range(num_sampled_per_masked):
+ sampled_idx = num_masked + (masked_idx * num_sampled_per_masked) + technique_idx
+ if sampled_idx < len(nodes):
+ edges.append((masked_idx, sampled_idx))
+
+ return levels, edges
+
+ levels, edges = get_levels_and_edges(nodes)
+
+ # Calculate positions with improved spacing
+ positions = {}
+
+ # Calculate horizontal spacing for the root nodes (masked sentences)
+ root_x_spacing = 0 # All root nodes at x=0
+ root_y_spacing = 8.0 # Vertical spacing between root nodes
+
+ # Calculate positions for sampled nodes
+ sampled_x = 3 # X position for all sampled nodes
+
+ # Calculate y positions for root nodes (masked sentences)
+ root_y_start = -(num_masked - 1) * root_y_spacing / 2
+ for i in range(num_masked):
+ positions[i] = (root_x_spacing, root_y_start + i * root_y_spacing)
+
+ # Calculate y positions for sampled nodes
+ for masked_idx in range(num_masked):
+ root_y = positions[masked_idx][1] # Y position of parent masked sentence
+
+ # Calculate y-spacing for children of this root
+ children_y_spacing = 1.5 # Vertical spacing between children of the same root
+ children_y_start = root_y - (num_sampled_per_masked - 1) * children_y_spacing / 2
+
+ # Position each child
+ for technique_idx in range(num_sampled_per_masked):
+ child_idx = num_masked + (masked_idx * num_sampled_per_masked) + technique_idx
+ child_y = children_y_start + technique_idx * children_y_spacing
+ positions[child_idx] = (sampled_x, child_y)
+
+ # Create figure
+ fig2 = go.Figure()
+
+ # Add nodes
+ for i, node in enumerate(wrapped_nodes):
+ x, y = positions[i]
+
+ # Define node color based on level
+ node_color = 'blue' if levels[i] == 0 else 'green'
+
+ # Add the node marker
+ fig2.add_trace(go.Scatter(
+ x=[x],
+ y=[y],
+ mode='markers',
+ marker=dict(size=20, color=node_color, line=dict(color='black', width=2)),
+ hoverinfo='none'
+ ))
+
+ # Add node label with highlighting
+ colored_node = color_highlighted_words(node, global_color_map)
+
+ fig2.add_annotation(
+ x=x,
+ y=y,
+ text=colored_node,
+ showarrow=False,
+ xshift=15,
+ align="left",
+ font=dict(size=12),
+ bordercolor='black',
+ borderwidth=2,
+ borderpad=4,
+ bgcolor='white',
+ width=400,
+ height=100
+ )
+
+ # Add edges with labels
+ for i, (src, dst) in enumerate(edges):
+ x0, y0 = positions[src]
+ x1, y1 = positions[dst]
+
+ # Draw the edge
+ fig2.add_trace(go.Scatter(
+ x=[x0, x1],
+ y=[y0, y1],
+ mode='lines',
+ line=dict(color='black', width=1)
+ ))
+
+ # Add sampling technique label
+ # Determine which sampling technique this is
+ parent_idx = src
+ technique_count = sum(1 for k, (s, _) in enumerate(edges) if s == parent_idx and k < i)
+ technique_label = sampling_techniques[technique_count % len(sampling_techniques)]
+
+ # Calculate midpoint for the label
+ mid_x = (x0 + x1) / 2
+ mid_y = (y0 + y1) / 2
+
+ # Add slight offset to avoid overlap
+ label_offset = 0.1
+
+ fig2.add_annotation(
+ x=mid_x,
+ y=mid_y + label_offset,
+ text=technique_label,
+ showarrow=False,
+ font=dict(size=8),
+ align="center"
+ )
+
+ # Update layout
+ fig2.update_layout(
+ showlegend=False,
+ margin=dict(t=20, b=20, l=20, r=20),
+ xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+ yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+ width=1200, # Adjusted width to accommodate more levels
+ height=2000, # Adjusted height to accommodate more levels
+ plot_bgcolor='rgba(240,240,240,0.2)',
+ paper_bgcolor='white'
+
+ )
+
+ return fig2
+
+if __name__ == "__main__":
+ paraphrased_sentence = "The quick brown fox jumps over the lazy dog."
+ masked_sentences = [
+ "A fast brown fox leaps over the lazy dog.",
+ "A quick brown fox hops over a lazy dog."
+ ]
+ highlight_info = [
+ ("quick", "red"),
+ ("brown", "green"),
+ ("fox", "blue"),
+ ("lazy", "purple")
+ ]
+ common_grams = [
+ (1, "quick brown fox"),
+ (2, "lazy dog")
+ ]
+
+ fig1 = generate_subplot1(paraphrased_sentence, masked_sentences, highlight_info, common_grams)
+ fig1.show()
+
+ sampled_sentence = ["A fast brown fox jumps over a lazy dog."]
+
+
+ fig2 = generate_subplot2(masked_sentences, sampled_sentence, highlight_info, common_grams)
+ fig2.show()
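+
+    # Optional: persist both figures as standalone HTML files (hypothetical
+    # output paths) so they can be inspected outside the Gradio UI.
+    fig1.write_html("subplot1_tree.html")
+    fig2.write_html("subplot2_tree.html")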
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..576b038724ea2b6389f67fec2ebdbe2e21468e03
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,5 @@
+from utils.watermark import Watermarker
+from utils.paraphraser import Paraphraser
+from utils.entailment import EntailmentAnalyzer
+from utils.sampling import SamplingProcessor
+from utils.config import load_config
\ No newline at end of file
diff --git a/utils/__pycache__/__init__.cpython-310.pyc b/utils/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30ce4a254bd80ace4f22d103dd473a1cf9b3283e
Binary files /dev/null and b/utils/__pycache__/__init__.cpython-310.pyc differ
diff --git a/utils/__pycache__/__init__.cpython-311.pyc b/utils/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b5555a79f7cb76bcd3e4f723e9028acf1f26933
Binary files /dev/null and b/utils/__pycache__/__init__.cpython-311.pyc differ
diff --git a/utils/__pycache__/config.cpython-310.pyc b/utils/__pycache__/config.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3582700fa53896a8194864be186753ee3a8ab82
Binary files /dev/null and b/utils/__pycache__/config.cpython-310.pyc differ
diff --git a/utils/__pycache__/config.cpython-311.pyc b/utils/__pycache__/config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c96b59a35d6728248efd21863acee2f81f1a645a
Binary files /dev/null and b/utils/__pycache__/config.cpython-311.pyc differ
diff --git a/utils/__pycache__/entailment.cpython-310.pyc b/utils/__pycache__/entailment.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68ad206c84236b0a2ec5d00f2bfe9ee69dc67e3c
Binary files /dev/null and b/utils/__pycache__/entailment.cpython-310.pyc differ
diff --git a/utils/__pycache__/entailment.cpython-311.pyc b/utils/__pycache__/entailment.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6fc993c2719d8063a2bba33273f7adc968e2950
Binary files /dev/null and b/utils/__pycache__/entailment.cpython-311.pyc differ
diff --git a/utils/__pycache__/masking_methods.cpython-310.pyc b/utils/__pycache__/masking_methods.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b99bf858b1d9cdc684d8d785f7d7c28733a7240
Binary files /dev/null and b/utils/__pycache__/masking_methods.cpython-310.pyc differ
diff --git a/utils/__pycache__/masking_methods.cpython-311.pyc b/utils/__pycache__/masking_methods.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..018ef6f3519408daa5502ad0905b3d207341d76a
Binary files /dev/null and b/utils/__pycache__/masking_methods.cpython-311.pyc differ
diff --git a/utils/__pycache__/non_melting_point.cpython-310.pyc b/utils/__pycache__/non_melting_point.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23bd50bb431040f19671e32be8856a325ab25f80
Binary files /dev/null and b/utils/__pycache__/non_melting_point.cpython-310.pyc differ
diff --git a/utils/__pycache__/non_melting_point.cpython-311.pyc b/utils/__pycache__/non_melting_point.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdb66bfaa7274f08f281416d1ada8bb975503381
Binary files /dev/null and b/utils/__pycache__/non_melting_point.cpython-311.pyc differ
diff --git a/utils/__pycache__/paraphraser.cpython-310.pyc b/utils/__pycache__/paraphraser.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aadaec55f8b172edcbd271df987813ef13c80eb4
Binary files /dev/null and b/utils/__pycache__/paraphraser.cpython-310.pyc differ
diff --git a/utils/__pycache__/paraphraser.cpython-311.pyc b/utils/__pycache__/paraphraser.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06362f19d3beb20b5acf6e9f37130b73f000510c
Binary files /dev/null and b/utils/__pycache__/paraphraser.cpython-311.pyc differ
diff --git a/utils/__pycache__/sampling.cpython-310.pyc b/utils/__pycache__/sampling.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec8afbc843e5e7d97325fa9438f2fd686c5743b0
Binary files /dev/null and b/utils/__pycache__/sampling.cpython-310.pyc differ
diff --git a/utils/__pycache__/sampling.cpython-311.pyc b/utils/__pycache__/sampling.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3d44f57068e2d56d8db99e96f55526db27eeb78
Binary files /dev/null and b/utils/__pycache__/sampling.cpython-311.pyc differ
diff --git a/utils/__pycache__/watermark.cpython-310.pyc b/utils/__pycache__/watermark.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9943c62cbae737b816cd03eb62e9229ab1a3ba0e
Binary files /dev/null and b/utils/__pycache__/watermark.cpython-310.pyc differ
diff --git a/utils/__pycache__/watermark.cpython-311.pyc b/utils/__pycache__/watermark.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb5322d289d38244fa4f36d3acac5cb06b934edb
Binary files /dev/null and b/utils/__pycache__/watermark.cpython-311.pyc differ
diff --git a/utils/config.py b/utils/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d2997275e03a0ab219f78f65482d7cead8b5ebd
--- /dev/null
+++ b/utils/config.py
@@ -0,0 +1,18 @@
+"""
+This file loads config from config.yaml
+"""
+
+import yaml
+
+def load_config(path):
+ """
+ Function to load config from config.yaml
+ """
+ try:
+ with open(path, "r") as file:
+ config = yaml.safe_load(file)
+ return config
+ except FileNotFoundError:
+        raise FileNotFoundError(f"Config file not found: {path}")
+ except Exception as e:
+ raise e
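+
+if __name__ == "__main__":
+    # Minimal usage sketch; assumes the script is run from the repository root,
+    # where the configuration file lives at utils/config.yaml.
+    cfg = load_config("utils/config.yaml")
+    print(cfg["PECCAVI_TEXT"]["Paraphrase"]["model"])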
diff --git a/utils/config.yaml b/utils/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f91e3fcf8a5ed368d540256d4a2489a4752f9b34
--- /dev/null
+++ b/utils/config.yaml
@@ -0,0 +1,48 @@
+# This is the official config file.
+PECCAVI_TEXT:
+ Entailment:
+ task: "text-classification"
+ model: "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
+
+ Masking:
+ task: "fill-mask"
+ tokenizer: "bert-base-uncased"
+ model: "bert-base-uncased"
+ # tokenizer: "bert-large-cased-whole-word-masking"
+ # model: "bert-large-cased-whole-word-masking"
+
+ Vocabulary:
+ tokenizer: "bert-base-uncased"
+ model: "bert-base-uncased"
+ # permissible_ratio: 0.5
+ # tokenizer: "bert-large-cased-whole-word-masking"
+ # model: "bert-large-cased-whole-word-masking"
+ permissible_ratio: 1.0
+
+ Sampling:
+ tokenizer: "bert-base-uncased"
+ model: "bert-base-uncased"
+ # tokenizer: "bert-large-cased-whole-word-masking"
+ # model: "bert-large-cased-whole-word-masking"
+
+ Metrics:
+ EuclideanDistance: "sentence-transformers/all-MiniLM-L6-v2"
+ Distortion: "gpt2"
+
+ Detector:
+ tokenizer: "bert-base-uncased"
+ model: "bert-base-uncased"
+ # tokenizer: "bert-large-cased-whole-word-masking"
+ # model: "bert-large-cased-whole-word-masking"
+
+ Paraphrase:
+ tokenizer: "humarin/chatgpt_paraphraser_on_T5_base"
+ model: "humarin/chatgpt_paraphraser_on_T5_base"
+ num_beams: 10
+ num_beam_groups: 10
+ num_return_sequences: 10
+ repetition_penalty: 10.0
+ diversity_penalty: 3.0
+ no_repeat_ngram_size: 2
+ temperature: 0.7
+ max_length: 64
diff --git a/utils/entailment.py b/utils/entailment.py
new file mode 100644
index 0000000000000000000000000000000000000000..aba1608ea349ffd7649d0c83a053a6433d8e489a
--- /dev/null
+++ b/utils/entailment.py
@@ -0,0 +1,107 @@
+import sys
+import os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+import numpy as np
+from transformers import pipeline
+from typing import List
+from utils.config import load_config
+
+
+class EntailmentAnalyzer:
+ # def __init__(self, config_path: str):
+ def __init__(self, config):
+ """
+ Initialize the EntailmentAnalyzer with the config file path.
+
+ Args:
+ config_path: The path to the configuration file.
+ """
+ # self.config = load_config(config_path)['PECCAVI_TEXT']['Entailment']
+ self.config = config
+ self.entailment_pipeline = pipeline(task=self.config['task'], model=self.config['model'])
+
+ def check_entailment(self, premise: str, hypothesis: str) -> float:
+ """
+ Check entailment between the premise and hypothesis.
+
+ Args:
+ premise: The premise sentence.
+ hypothesis: The hypothesis sentence.
+
+ Returns:
+ float: The entailment score.
+ """
+ results = self.entailment_pipeline(f"{premise} [SEP] {hypothesis}", top_k=None)
+ entailment_score = next(item['score'] for item in results if item['label'] == 'entailment')
+ return entailment_score
+
+ def analyze_entailment(self, original_sentence: str, paraphrased_sentences: List[str], threshold: float) -> tuple:
+ """
+ Analyze entailment scores for paraphrased sentences. If no selected sentences are found,
+ lower the threshold and rerun the analysis.
+
+ Args:
+ original_sentence: The original sentence.
+ paraphrased_sentences: List of paraphrased sentences.
+ threshold: Minimum score to select a sentence.
+
+ Returns:
+            tuple: Three dictionaries mapping each sentence to its score: all sentences, selected sentences, and discarded sentences.
+ """
+ all_sentences = {}
+ selected_sentences = {}
+ discarded_sentences = {}
+
+ # Loop to reduce threshold if no sentences are selected
+ while not selected_sentences:
+ for paraphrased_sentence in paraphrased_sentences:
+ entailment_score = self.check_entailment(original_sentence, paraphrased_sentence)
+
+ all_sentences[paraphrased_sentence] = entailment_score
+ if entailment_score >= threshold:
+ selected_sentences[paraphrased_sentence] = entailment_score
+ else:
+ discarded_sentences[paraphrased_sentence] = entailment_score
+
+ # If no sentences are selected, lower the threshold
+ if not selected_sentences:
+ print(f"No selected sentences found. Lowering the threshold by 0.1 (from {threshold} to {threshold - 0.1}).")
+ threshold -= 0.1
+ if threshold <= 0:
+ print("Threshold has reached 0. No sentences meet the criteria.")
+ break
+
+ return all_sentences, selected_sentences, discarded_sentences
+
+
+if __name__ == "__main__":
+    # config.yaml ships alongside this module in utils/
+    config_path = os.path.join(os.path.dirname(__file__), 'config.yaml')
+
+ config = load_config(config_path)
+
+ entailment_analyzer = EntailmentAnalyzer(config['PECCAVI_TEXT']['Entailment'])
+
+ all_sentences, selected_sentences, discarded_sentences = entailment_analyzer.analyze_entailment(
+ "The weather is nice today",
+ [
+ "The climate is pleasant today",
+ "It's a good day weather-wise",
+ "Today, the weather is terrible",
+ "What a beautiful day it is",
+ "The sky is clear and the weather is perfect",
+ "It's pouring rain outside today",
+ "The weather isn't bad today",
+ "A lovely day for outdoor activities"
+ ],
+ 0.7
+ )
+
+ print("----------------------- All Sentences -----------------------")
+ print(all_sentences)
+ print("----------------------- Discarded Sentences -----------------------")
+ print(discarded_sentences)
+ print("----------------------- Selected Sentences -----------------------")
+ print(selected_sentences)
diff --git a/utils/masking_methods.py b/utils/masking_methods.py
new file mode 100644
index 0000000000000000000000000000000000000000..987afb2b026d04467b0c4e73486fd42961e2cb4f
--- /dev/null
+++ b/utils/masking_methods.py
@@ -0,0 +1,304 @@
+import random
+import torch
+import logging
+from transformers import BertTokenizer, BertForMaskedLM
+from nltk.corpus import stopwords
+import nltk
+from transformers import RobertaTokenizer, RobertaForMaskedLM
+from tqdm import tqdm
+
+# Set logging to WARNING for a cleaner terminal.
+logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+# Ensure stopwords are downloaded
+try:
+ nltk.data.find('corpora/stopwords')
+except LookupError:
+ nltk.download('stopwords')
+
+class MaskingProcessor:
+ def __init__(self, tokenizer, model):
+ self.tokenizer = tokenizer
+ self.model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ self.stop_words = set(stopwords.words('english'))
+ tqdm.write(f"[MaskingProcessor] Initialized on device: {self.device}")
+
+ def remove_stopwords(self, words):
+ return [word for word in words if word.lower() not in self.stop_words]
+
+ def adjust_ngram_indices(self, original_words, common_ngrams):
+ logger.info("Adjusting n-gram indices.")
+ non_stop_words = self.remove_stopwords(original_words)
+ original_to_non_stop = []
+ non_stop_idx = 0
+ for original_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ original_to_non_stop.append((original_idx, non_stop_idx))
+ non_stop_idx += 1
+ adjusted_ngrams = {}
+ for ngram, positions in common_ngrams.items():
+ adjusted_positions = []
+ for start, end in positions:
+ try:
+ new_start = next(non_stop for orig, non_stop in original_to_non_stop if orig == start)
+ new_end = next(non_stop for orig, non_stop in original_to_non_stop if orig == end)
+ adjusted_positions.append((new_start, new_end))
+ except StopIteration:
+ continue
+ adjusted_ngrams[ngram] = adjusted_positions
+ return adjusted_ngrams
+
+ def mask_sentence_random(self, sentence, common_ngrams):
+ tqdm.write(f"[MaskingProcessor] Masking (random) sentence: {sentence}")
+ original_words = sentence.split()
+ has_punctuation = False
+ punctuation = ''
+ if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
+ has_punctuation = True
+ punctuation = original_words[-1][-1]
+ original_words = original_words[:-1]
+
+ non_stop_words = self.remove_stopwords(original_words)
+ adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
+ mask_indices = []
+
+ ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
+ if ngram_positions:
+ first_ngram_start = ngram_positions[0][0]
+ if first_ngram_start > 0:
+ mask_index_before_ngram = random.randint(0, first_ngram_start-1)
+ mask_indices.append(mask_index_before_ngram)
+
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][1]
+ start_next = ngram_positions[i + 1][0]
+ if start_next > end_prev + 1:
+ mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
+ mask_indices.append(mask_index_between_ngrams)
+
+ last_ngram_end = ngram_positions[-1][1]
+ if last_ngram_end < len(non_stop_words) - 1:
+ mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
+ mask_indices.append(mask_index_after_ngram)
+
+ non_stop_to_original = {}
+ non_stop_idx = 0
+ for orig_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ non_stop_to_original[non_stop_idx] = orig_idx
+ non_stop_idx += 1
+
+ original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
+ masked_words = original_words.copy()
+ for idx in original_mask_indices:
+ masked_words[idx] = self.tokenizer.mask_token
+
+ if has_punctuation:
+ masked_words.append(punctuation)
+
+ logger.info(f"Masked sentence (random): {' '.join(masked_words)}")
+ return " ".join(masked_words), original_mask_indices
+
+ def mask_sentence_pseudorandom(self, sentence, common_ngrams):
+ logger.info(f"Masking sentence using pseudorandom strategy: {sentence}")
+ random.seed(3)
+ original_words = sentence.split()
+ has_punctuation = False
+ punctuation = ''
+ if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
+ has_punctuation = True
+ punctuation = original_words[-1][-1]
+ original_words = original_words[:-1]
+
+ non_stop_words = self.remove_stopwords(original_words)
+ adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
+ mask_indices = []
+ ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
+
+ if ngram_positions:
+ first_ngram_start = ngram_positions[0][0]
+ if first_ngram_start > 0:
+ mask_index_before_ngram = random.randint(0, first_ngram_start-1)
+ mask_indices.append(mask_index_before_ngram)
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][1]
+ start_next = ngram_positions[i + 1][0]
+ if start_next > end_prev + 1:
+ mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
+ mask_indices.append(mask_index_between_ngrams)
+ last_ngram_end = ngram_positions[-1][1]
+ if last_ngram_end < len(non_stop_words) - 1:
+ mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
+ mask_indices.append(mask_index_after_ngram)
+
+ non_stop_to_original = {}
+ non_stop_idx = 0
+ for orig_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ non_stop_to_original[non_stop_idx] = orig_idx
+ non_stop_idx += 1
+
+ original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
+ masked_words = original_words.copy()
+ for idx in original_mask_indices:
+ masked_words[idx] = self.tokenizer.mask_token
+
+ if has_punctuation:
+ masked_words.append(punctuation)
+
+ logger.info(f"Masked sentence (pseudorandom): {' '.join(masked_words)}")
+ return " ".join(masked_words), original_mask_indices
+
+ def mask_sentence_entropy(self, sentence, common_ngrams):
+ logger.info(f"Masking sentence using entropy strategy: {sentence}")
+ original_words = sentence.split()
+ has_punctuation = False
+ punctuation = ''
+ if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
+ has_punctuation = True
+ punctuation = original_words[-1][-1]
+ original_words = original_words[:-1]
+
+ non_stop_words = self.remove_stopwords(original_words)
+ adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
+ mask_indices = []
+ ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
+ non_stop_to_original = {}
+ non_stop_idx = 0
+ for orig_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ non_stop_to_original[non_stop_idx] = orig_idx
+ non_stop_idx += 1
+
+ if ngram_positions:
+ first_ngram_start = ngram_positions[0][0]
+ if first_ngram_start > 0:
+ candidate_positions = range(0, first_ngram_start)
+ entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) for pos in candidate_positions]
+ mask_indices.append(max(entropies, key=lambda x: x[1])[0])
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][1]
+ start_next = ngram_positions[i + 1][0]
+ if start_next > end_prev + 1:
+ candidate_positions = range(end_prev + 1, start_next)
+ entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) for pos in candidate_positions]
+ mask_indices.append(max(entropies, key=lambda x: x[1])[0])
+ last_ngram_end = ngram_positions[-1][1]
+ if last_ngram_end < len(non_stop_words) - 1:
+ candidate_positions = range(last_ngram_end + 1, len(non_stop_words))
+ entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) for pos in candidate_positions]
+ mask_indices.append(max(entropies, key=lambda x: x[1])[0])
+
+ original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
+ masked_words = original_words.copy()
+ for idx in original_mask_indices:
+ masked_words[idx] = self.tokenizer.mask_token
+
+ if has_punctuation:
+ masked_words.append(punctuation)
+
+ logger.info(f"Masked sentence (entropy): {' '.join(masked_words)}")
+ return " ".join(masked_words), original_mask_indices
+
+ def calculate_mask_logits(self, original_sentence, original_mask_indices):
+ logger.info(f"Calculating mask logits for sentence: {original_sentence}")
+ words = original_sentence.split()
+ mask_logits = {}
+ for idx in original_mask_indices:
+ masked_words = words.copy()
+ masked_words[idx] = self.tokenizer.mask_token
+ masked_sentence = " ".join(masked_words)
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"].to(self.device)
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+ mask_logits_tensor = logits[0, mask_token_index, :]
+ top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 100, dim=-1)
+ top_tokens = []
+ top_logits = []
+ seen_words = set()
+ for token_id, logit in zip(top_mask_indices[0], top_mask_logits[0]):
+ token = self.tokenizer.convert_ids_to_tokens(token_id.item())
+ if token.startswith('##'):
+ continue
+ word = self.tokenizer.convert_tokens_to_string([token]).strip()
+ if word and word not in seen_words:
+ seen_words.add(word)
+ top_tokens.append(word)
+ top_logits.append(logit.item())
+ if len(top_tokens) == 50:
+ break
+ mask_logits[idx] = {
+ "tokens": top_tokens,
+ "logits": top_logits
+ }
+ logger.info("Completed calculating mask logits.")
+ return mask_logits
+
+ def calculate_word_entropy(self, sentence, word_position):
+ logger.info(f"Calculating word entropy for position {word_position} in sentence: {sentence}")
+ words = sentence.split()
+ masked_words = words.copy()
+ masked_words[word_position] = self.tokenizer.mask_token
+ masked_sentence = " ".join(masked_words)
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"].to(self.device)
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+ probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1)
+ entropy = -torch.sum(probs * torch.log(probs + 1e-9))
+ logger.info(f"Computed entropy: {entropy.item()}")
+ return entropy.item()
+
+ def process_sentences(self, sentences_list, common_grams, method="random"):
+ tqdm.write(f"[MaskingProcessor] Processing sentences using method: {method}")
+ results = {}
+ for sentence, ngrams in tqdm(common_grams.items(), desc="Masking Sentences"):
+ words = sentence.split()
+ last_word = words[-1]
+ if any(last_word.endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
+ words[-1] = last_word[:-1]
+ punctuation = last_word[-1]
+ processed_sentence = " ".join(words) + " " + punctuation
+ else:
+ processed_sentence = sentence
+
+ if method == "random":
+ masked_sentence, original_mask_indices = self.mask_sentence_random(processed_sentence, ngrams)
+ elif method == "pseudorandom":
+ masked_sentence, original_mask_indices = self.mask_sentence_pseudorandom(processed_sentence, ngrams)
+ else: # entropy
+ masked_sentence, original_mask_indices = self.mask_sentence_entropy(processed_sentence, ngrams)
+
+ logits = self.calculate_mask_logits(processed_sentence, original_mask_indices)
+ results[sentence] = {
+ "masked_sentence": masked_sentence,
+ "mask_logits": logits
+ }
+ logger.info(f"Processed sentence: {sentence}")
+ tqdm.write("[MaskingProcessor] Completed processing sentences.")
+ return results
+
+if __name__ == "__main__":
+ sentences = [
+ "The quick brown fox jumps over small cat the lazy dog everyday again and again .",
+ ]
+ result_dict = {
+ 'The quick brown fox jumps over small cat the lazy dog everyday again and again .': {
+ 'brown fox': [(2, 3)],
+ 'cat': [(7, 7)],
+ 'dog': [(10, 10)]
+ }
+ }
+ processor = MaskingProcessor(
+ BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking"),
+ BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+ )
+    results_random = processor.process_sentences(sentences, result_dict, method="random")
+    for sentence, output in results_random.items():
+        logger.info(f"Original Sentence (Random): {sentence}")
+        logger.info(f"Masked Sentence (Random): {output['masked_sentence']}")
diff --git a/utils/non_melting_point.py b/utils/non_melting_point.py
new file mode 100644
index 0000000000000000000000000000000000000000..b61788f5c06a22ec14d2c9af94cdff8fcee819d6
--- /dev/null
+++ b/utils/non_melting_point.py
@@ -0,0 +1,137 @@
+import nltk
+import logging
+from nltk.corpus import stopwords
+from nltk.util import ngrams
+from collections import Counter
+import re
+from tqdm import tqdm
+
+# Set logging to WARNING for minimal console output.
+logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+class NgramProcessor:
+ def __init__(self):
+ try:
+ nltk.data.find('corpora/stopwords')
+ except LookupError:
+ nltk.download('stopwords')
+ self.stop_words = set(stopwords.words('english'))
+ tqdm.write("[NgramProcessor] Initialized with stopwords.")
+
+ def remove_stopwords(self, text):
+ # No need for extensive logging inside this helper.
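+        # e.g. "The quick brown fox jumps over the lazy dog." -> "quick brown fox jumps lazy dog"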
+ words = re.findall(r'\w+', text.lower())
+ filtered_words = [word for word in words if word not in self.stop_words]
+ return ' '.join(filtered_words)
+
+ def is_exact_match(self, ngram, sentences):
+ logger.info(f"Checking exact match for ngram: {ngram}")
+ result = all(ngram in sentence for sentence in sentences)
+ logger.info(f"Exact match result for '{ngram}': {result}")
+ return result
+
+ def is_substring_of_any(self, ngram, common_ngrams):
+ logger.info(f"Checking if ngram: {ngram} is substring of any common ngram.")
+ result = any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)
+ logger.info(f"Substring check result for '{ngram}': {result}")
+ return result
+
+ def find_filtered_ngrams(self, sentences):
+ tqdm.write("[NgramProcessor] Cleaning sentences...")
+ sentences_cleaned = [self.remove_stopwords(sentence)
+ for sentence in tqdm(sentences, desc="Cleaning Sentences")]
+ ngram_lengths = [4, 3, 2, 1]
+ common_ngrams = []
+ result = {}
+ for n in ngram_lengths:
+ ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences_cleaned]
+ ngrams_counter = Counter(ngrams_list[0])
+ for ngram in ngrams_counter:
+ ngram_str = ' '.join(ngram)
+ if any(word in self.stop_words for word in ngram_str.split()):
+ continue
+ if self.is_exact_match(ngram_str, sentences_cleaned) and not self.is_substring_of_any(ngram_str, common_ngrams):
+ common_ngrams.append(ngram_str)
+ for sentence, cleaned_sentence in tqdm(zip(sentences, sentences_cleaned),
+ total=len(sentences),
+ desc="Mapping N-grams"):
+ sentence_result = {}
+ original_words = sentence.split()
+ cleaned_words = cleaned_sentence.split()
+ index_map = {}
+ cleaned_idx = 0
+ for orig_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ index_map[cleaned_idx] = orig_idx
+ cleaned_idx += 1
+ for ngram in common_ngrams:
+ ngram_words = ngram.split()
+ indices = []
+ for i in range(len(cleaned_words) - len(ngram_words) + 1):
+ if cleaned_words[i:i + len(ngram_words)] == ngram_words:
+ if i in index_map:
+ start_idx = index_map[i]
+ end_idx = index_map.get(i + len(ngram_words) - 1, start_idx)
+ if end_idx - start_idx == len(ngram_words) - 1:
+ indices.append((start_idx, end_idx))
+
+ if indices:
+ sentence_result[ngram] = indices
+ result[sentence] = sentence_result
+ return result
+
+ # def find_relative_order(self, sentence, common_ngrams):
+ # from tqdm import tqdm
+ # relative_order = []
+ # for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False):
+ # index = sentence.find(ngram)
+ # if index != -1:
+ # relative_order.append((index, ngram))
+ # return sorted(relative_order)
+
+ def find_relative_order(self, sentence, common_ngrams):
+ from tqdm import tqdm
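+        # Example: with common_ngrams = ['brown fox', 'dog'], the sentence
+        # "The quick brown fox jumps over the lazy dog ." yields
+        # [(1, 'brown fox'), (2, 'dog')].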
+ sentence = sentence.lower()
+ relative_order = []
+
+ for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False):
+ index = sentence.find(ngram.lower())
+ if index != -1:
+ relative_order.append((index, ngram))
+
+ sorted_pairs = sorted(relative_order)
+ return [(i+1, ngram) for i, (_, ngram) in enumerate(sorted_pairs)]
+
+# Example usage
+if __name__ == "__main__":
+ sentences = [
+ "The quick brown fox jumps over the lazy dog .",
+ "A speedy brown fox jumps over a lazy dog.",
+ "A swift brown fox leaps over the lethargic dog.",
+ ]
+ processor = NgramProcessor()
+ common_ngrams = processor.find_filtered_ngrams(sentences)
+ print(common_ngrams)
+ # modified_output = list({
+ # (indices[0][0], gram)
+ # for grams in common_ngrams.values()
+ # for gram, indices in grams.items()
+ # })
+ # print(modified_output)
+ logger.info(f"Common n-grams and their indices per sentence: {common_ngrams}")
+ for sentence in sentences:
+ order = processor.find_relative_order(sentence, common_ngrams[sentence])
+ logger.info(f"Sentence: {sentence} -> Order: {order}")
+
+
+"""
+
+{
+'The quick brown fox jumps over the lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]},
+'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]},
+'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}
+}
+"""
+
diff --git a/utils/old/masking/masking_methods.py b/utils/old/masking/masking_methods.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd80c84a38f205374a5b911d1e16a8d0fc27d2a0
--- /dev/null
+++ b/utils/old/masking/masking_methods.py
@@ -0,0 +1,355 @@
+import random
+import torch
+from transformers import BertTokenizer, BertForMaskedLM
+from nltk.corpus import stopwords
+import nltk
+
+# Ensure stopwords are downloaded
+try:
+ nltk.data.find('corpora/stopwords')
+except LookupError:
+ nltk.download('stopwords')
+
+class MaskingProcessor:
+ def __init__(self, ):
+ self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+ self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
+ self.stop_words = set(stopwords.words('english'))
+
+ def adjust_ngram_indices(self, words, common_ngrams, remove_stopwords):
+ """
+ Adjust indices of common n-grams after removing stop words.
+
+ Args:
+ words (list): List of words in the original sentence.
+ common_ngrams (dict): Common n-grams and their indices.
+
+ Returns:
+ dict: Adjusted common n-grams and their indices.
+ """
+ if not remove_stopwords:
+ return common_ngrams
+
+ non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
+ adjusted_ngrams = {}
+
+ for ngram, positions in common_ngrams.items():
+ adjusted_positions = []
+ for start, end in positions:
+ try:
+ new_start = non_stop_word_indices.index(start)
+ new_end = non_stop_word_indices.index(end)
+ adjusted_positions.append((new_start, new_end))
+ except ValueError:
+ continue # Skip if indices cannot be mapped
+ adjusted_ngrams[ngram] = adjusted_positions
+
+ return adjusted_ngrams
+
+ # def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords):
+ # """
+ # Mask one word before the first common n-gram, one between two n-grams,
+ # and one after the last common n-gram (random selection).
+
+ # Args:
+ # original_sentence (str): Original sentence
+ # common_ngrams (dict): Common n-grams and their indices
+
+ # Returns:
+ # str: Masked sentence with original stop words retained
+ # """
+ # words = original_sentence.split()
+ # if remove_stopwords:
+ # non_stop_words = [word for word in words if word.lower() not in self.stop_words]
+ # non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
+ # else:
+ # non_stop_words = words
+ # non_stop_word_indices = list(range(len(words)))
+ # # non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
+ # adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)
+
+ # mask_indices = []
+ # # Handle before the first common n-gram
+ # if adjusted_ngrams:
+ # first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
+ # if first_ngram_start > 0:
+ # mask_indices.append(random.randint(0, first_ngram_start - 1))
+
+ # # Handle between common n-grams
+ # ngram_positions = list(adjusted_ngrams.values())
+ # for i in range(len(ngram_positions) - 1):
+ # end_prev = ngram_positions[i][-1][1]
+ # start_next = ngram_positions[i + 1][0][0]
+ # if start_next > end_prev + 1:
+ # mask_indices.append(random.randint(end_prev + 1, start_next - 1))
+
+ # # Handle after the last common n-gram
+ # last_ngram_end = ngram_positions[-1][-1][1]
+ # if last_ngram_end < len(non_stop_words) - 1:
+ # mask_indices.append(random.randint(last_ngram_end + 1, len(non_stop_words) - 1))
+
+ # # Mask the chosen indices
+ # original_masked_sentence = words[:]
+ # # for idx in mask_indices:
+ # # if idx not in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
+ # # non_stop_words[idx] = self.tokenizer.mask_token
+ # # original_masked_sentence[idx] = self.tokenizer.mask_token
+ # for idx in mask_indices:
+ # if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
+ # continue # Skip if index belongs to common n-grams
+ # if remove_stopwords:
+ # original_idx = non_stop_word_indices[idx] # Map back to original indices
+ # original_masked_sentence[original_idx] = self.tokenizer.mask_token
+ # else:
+ # original_masked_sentence[idx] = self.tokenizer.mask_token
+
+
+ # return " ".join(original_masked_sentence)
+ def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords):
+ """
+ Mask one word before the first common n-gram, one between two n-grams,
+ and one after the last common n-gram (random selection).
+
+ Args:
+ original_sentence (str): Original sentence
+ common_ngrams (dict): Common n-grams and their indices
+ remove_stopwords (bool): Whether to remove stop words
+
+ Returns:
+ str: Masked sentence with original stop words retained
+ """
+ words = original_sentence.split()
+ if remove_stopwords:
+ non_stop_words = [word for word in words if word.lower() not in self.stop_words]
+ non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
+ else:
+ non_stop_words = words
+ non_stop_word_indices = list(range(len(words)))
+
+ adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)
+
+ # Collect all indices corresponding to common n-grams
+ common_ngram_indices = {
+ idx for ngram_positions in adjusted_ngrams.values()
+ for start, end in ngram_positions
+ for idx in range(start, end + 1)
+ }
+
+ mask_indices = []
+ # Handle before the first common n-gram
+ if adjusted_ngrams:
+ first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
+ if first_ngram_start > 0:
+ potential_indices = [i for i in range(first_ngram_start) if i not in common_ngram_indices]
+ if potential_indices:
+ mask_indices.append(random.choice(potential_indices))
+
+ # Handle between common n-grams
+ ngram_positions = list(adjusted_ngrams.values())
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][-1][1]
+ start_next = ngram_positions[i + 1][0][0]
+ potential_indices = [i for i in range(end_prev + 1, start_next) if i not in common_ngram_indices]
+ if potential_indices:
+ mask_indices.append(random.choice(potential_indices))
+
+ # Handle after the last common n-gram
+ last_ngram_end = ngram_positions[-1][-1][1]
+ if last_ngram_end < len(non_stop_words) - 1:
+ potential_indices = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i not in common_ngram_indices]
+ if potential_indices:
+ mask_indices.append(random.choice(potential_indices))
+
+ # Mask the chosen indices
+ original_masked_sentence = words[:]
+ for idx in mask_indices:
+ if remove_stopwords:
+ original_idx = non_stop_word_indices[idx] # Map back to original indices
+ original_masked_sentence[original_idx] = self.tokenizer.mask_token
+ else:
+ original_masked_sentence[idx] = self.tokenizer.mask_token
+
+ return " ".join(original_masked_sentence)
+
+ def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords):
+ """
+ Mask one word before the first common n-gram, one between two n-grams,
+ and one after the last common n-gram (highest entropy selection).
+
+ Args:
+ original_sentence (str): Original sentence
+ common_ngrams (dict): Common n-grams and their indices
+
+ Returns:
+ str: Masked sentence with original stop words retained
+ """
+ words = original_sentence.split()
+ # non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
+ if remove_stopwords:
+ non_stop_words = [word for word in words if word.lower() not in self.stop_words]
+ non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
+ else:
+ non_stop_words = words
+ non_stop_word_indices = list(range(len(words)))
+ adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)
+ entropy_scores = {}
+
+ for idx, word in enumerate(non_stop_words):
+ if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
+ continue # Skip words in common n-grams
+
+ masked_sentence = non_stop_words[:idx] + [self.tokenizer.mask_token] + non_stop_words[idx + 1:]
+ masked_sentence = " ".join(masked_sentence)
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+
+ filtered_logits = logits[0, mask_token_index, :]
+ probs = torch.softmax(filtered_logits, dim=-1)
+ entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item() # Add epsilon to prevent log(0)
+ entropy_scores[idx] = entropy
+
+ mask_indices = []
+
+ # Handle before the first common n-gram
+ if adjusted_ngrams:
+ first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
+ candidates = [i for i in range(first_ngram_start) if i in entropy_scores]
+ if candidates:
+ mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))
+
+ # Handle between common n-grams
+ ngram_positions = list(adjusted_ngrams.values())
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][-1][1]
+ start_next = ngram_positions[i + 1][0][0]
+ candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores]
+ if candidates:
+ mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))
+
+ # Handle after the last common n-gram
+ last_ngram_end = ngram_positions[-1][-1][1]
+ candidates = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i in entropy_scores]
+ if candidates:
+ mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))
+
+ # Mask the chosen indices
+ original_masked_sentence = words[:]
+ # for idx in mask_indices:
+ # non_stop_words[idx] = self.tokenizer.mask_token
+ # original_masked_sentence[idx] = self.tokenizer.mask_token
+
+ for idx in mask_indices:
+ if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
+ continue # Skip if index belongs to common n-grams
+ if remove_stopwords:
+ original_idx = non_stop_word_indices[idx] # Map back to original indices
+ original_masked_sentence[original_idx] = self.tokenizer.mask_token
+ else:
+ original_masked_sentence[idx] = self.tokenizer.mask_token
+
+
+ return " ".join(original_masked_sentence)
+
+ def calculate_mask_logits(self, masked_sentence):
+ """
+ Calculate logits for masked tokens in the sentence using BERT.
+
+ Args:
+ masked_sentence (str): Sentence with [MASK] tokens
+
+ Returns:
+ dict: Masked token indices and their logits
+ """
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+
+ mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
+ return mask_logits
+
+ def process_sentences(self, original_sentences, result_dict, method="random", remove_stopwords=False):
+ """
+ Process a list of sentences and calculate logits for masked tokens using the specified method.
+
+ Args:
+ original_sentences (list): List of original sentences
+ result_dict (dict): Common n-grams and their indices for each sentence
+ method (str): Masking method ("random" or "entropy")
+
+ Returns:
+ dict: Masked sentences and their logits for each sentence
+ """
+ results = {}
+
+ for sentence, ngrams in result_dict.items():
+ if method == "random":
+ masked_sentence = self.mask_sentence_random(sentence, ngrams, remove_stopwords)
+ elif method == "entropy":
+ masked_sentence = self.mask_sentence_entropy(sentence, ngrams, remove_stopwords)
+ else:
+ raise ValueError("Invalid method. Choose 'random' or 'entropy'.")
+
+ logits = self.calculate_mask_logits(masked_sentence)
+ results[sentence] = {
+ "masked_sentence": masked_sentence,
+ "mask_logits": logits
+ }
+
+ return results
+
+# Example usage
+if __name__ == "__main__":
+ # !!! Working both the cases regardless if the stopword is removed or not
+ sentences = [
+ "The quick brown fox jumps over the lazy dog.",
+ "A speedy brown fox jumps over a lazy dog.",
+ "A swift brown fox leaps over the lethargic dog."
+ ]
+ result_dict ={
+ 'The quick brown fox jumps over the lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
+ 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
+ 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
+ }
+
+
+ processor = MaskingProcessor()
+ results_random = processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=True)
+ # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)
+
+ for sentence, output in results_random.items():
+ print(f"Original Sentence (Random): {sentence}")
+ print(f"Masked Sentence (Random): {output['masked_sentence']}")
+ # # print(f"Mask Logits (Random): {output['mask_logits']}")
+ # print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
+ # print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
+ # print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
+ print('--------------------------------')
+ # for mask_idx, logits in output["mask_logits"].items():
+ # print(f"Logits for [MASK] at position {mask_idx}:")
+ # print(f' logits : {logits[:5]}') # List of logits for all vocabulary tokens
+
+
+
+
+ # result_dict = {
+ # "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
+ # "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
+ # "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
+ # }
+
+
+ # print('--------------------------------')
+ # for sentence, output in results_entropy.items():
+ # print(f"Original Sentence (Entropy): {sentence}")
+ # print(f"Masked Sentence (Entropy): {output['masked_sentence']}")
+ # # print(f"Mask Logits (Entropy): {output['mask_logits']}")
+ # print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
+ # print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
+ # print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
\ No newline at end of file
diff --git a/utils/old/masking/masking_methods_new_work.py b/utils/old/masking/masking_methods_new_work.py
new file mode 100644
index 0000000000000000000000000000000000000000..b72482bce55f9257f68134017abb2076dd2d24a1
--- /dev/null
+++ b/utils/old/masking/masking_methods_new_work.py
@@ -0,0 +1,447 @@
+import random
+import torch
+from transformers import BertTokenizer, BertForMaskedLM
+from nltk.corpus import stopwords
+import nltk
+
+# Ensure stopwords are downloaded
+try:
+ nltk.data.find('corpora/stopwords')
+except LookupError:
+ nltk.download('stopwords')
+
+class MaskingProcessor:
+ def __init__(self):
+
+ self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+ self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
+ self.stop_words = set(stopwords.words('english'))
+
+ def remove_stopwords(self, words):
+ """
+ Remove stopwords from the given list of words.
+
+ Args:
+ words (list): List of words.
+
+ Returns:
+ list: List of non-stop words.
+ """
+ return [word for word in words if word.lower() not in self.stop_words]
+
+ def adjust_ngram_indices(self, original_words, common_ngrams):
+ """
+ Adjust indices of common n-grams after removing stopwords.
+
+ Args:
+ original_words (list): Original list of words.
+ common_ngrams (dict): Common n-grams and their indices.
+
+ Returns:
+ dict: Adjusted common n-grams with updated indices.
+ """
+ non_stop_words = self.remove_stopwords(original_words)
+ original_to_non_stop = []
+ non_stop_idx = 0
+
+ for original_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ original_to_non_stop.append((original_idx, non_stop_idx))
+ non_stop_idx += 1
+
+ adjusted_ngrams = {}
+ for ngram, positions in common_ngrams.items():
+ adjusted_positions = []
+ for start, end in positions:
+ try:
+ new_start = next(non_stop for orig, non_stop in original_to_non_stop if orig == start)
+ new_end = next(non_stop for orig, non_stop in original_to_non_stop if orig == end)
+ adjusted_positions.append((new_start, new_end))
+ except StopIteration:
+ continue # Skip if indices cannot be mapped
+ adjusted_ngrams[ngram] = adjusted_positions
+
+ return adjusted_ngrams
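+
+    # Hedged illustration of the index remapping above (assumes NLTK's English
+    # stopword list, in which "the" and "over" are stopwords):
+    #
+    #   processor = MaskingProcessor()
+    #   adjusted = processor.adjust_ngram_indices(
+    #       "The quick brown fox jumps over the lazy dog".split(),
+    #       {"brown fox": [(2, 3)]},
+    #   )
+    #   # non-stop words: ["quick", "brown", "fox", "jumps", "lazy", "dog"]
+    #   # adjusted == {"brown fox": [(1, 2)]}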
+
+ def mask_sentence_random(self, sentence, common_ngrams):
+ """
+ Mask words in the sentence based on the specified rules after removing stopwords.
+ """
+ original_words = sentence.split()
+ print(f' ---- original_words : {original_words} ----- ')
+ non_stop_words = self.remove_stopwords(original_words)
+ print(f' ---- non_stop_words : {non_stop_words} ----- ')
+ adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
+ print(f' ---- common_ngrams : {common_ngrams} ----- ')
+ print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')
+
+ mask_indices = []
+
+ # Extract n-gram positions in non-stop words
+ ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
+
+ # Mask a word before the first common n-gram
+ if ngram_positions:
+ print(f' ---- ngram_positions : {ngram_positions} ----- ')
+ first_ngram_start = ngram_positions[0][0]
+ print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
+ if first_ngram_start > 0:
+ mask_index_before_ngram = random.randint(0, first_ngram_start-1)
+ print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
+ mask_indices.append(mask_index_before_ngram)
+
+ # Mask words between common n-grams
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][1]
+                print(f' ---- end_prev : {end_prev} ----- ')  # end index of the previous common n-gram
+ start_next = ngram_positions[i + 1][0]
+ print(f' ---- start_next : {start_next} ----- ')
+ if start_next > end_prev + 1:
+ mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
+ print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
+ mask_indices.append(mask_index_between_ngrams)
+
+ # Mask a word after the last common n-gram
+ last_ngram_end = ngram_positions[-1][1]
+ if last_ngram_end < len(non_stop_words) - 1:
+ print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
+ mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
+ print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
+ mask_indices.append(mask_index_after_ngram)
+
+ # Create mapping from non-stop words to original indices
+ non_stop_to_original = {}
+ non_stop_idx = 0
+ for orig_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ non_stop_to_original[non_stop_idx] = orig_idx
+ non_stop_idx += 1
+
+ # Map mask indices from non-stop word positions to original positions
+ print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ')
+ original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
+ print(f' ---- original_mask_indices : {original_mask_indices} ----- ')
+
+ # Apply masks to the original sentence
+ masked_words = original_words.copy()
+ for idx in original_mask_indices:
+ masked_words[idx] = self.tokenizer.mask_token
+
+ return " ".join(masked_words)
+
+ def mask_sentence_pseudorandom(self, sentence, common_ngrams):
+ """
+ Mask words in the sentence based on the specified rules after removing stopwords.
+ """
+ random.seed(42)
+ original_words = sentence.split()
+ print(f' ---- original_words : {original_words} ----- ')
+ non_stop_words = self.remove_stopwords(original_words)
+ print(f' ---- non_stop_words : {non_stop_words} ----- ')
+ adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
+ print(f' ---- common_ngrams : {common_ngrams} ----- ')
+ print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')
+
+ mask_indices = []
+
+ # Extract n-gram positions in non-stop words
+ ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
+
+ # Mask a word before the first common n-gram
+ if ngram_positions:
+ print(f' ---- ngram_positions : {ngram_positions} ----- ')
+ first_ngram_start = ngram_positions[0][0]
+ print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
+ if first_ngram_start > 0:
+ mask_index_before_ngram = random.randint(0, first_ngram_start-1)
+ print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
+ mask_indices.append(mask_index_before_ngram)
+
+ # Mask words between common n-grams
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][1]
+ print(f' ---- end_prev : {end_prev} ----- ')
+ start_next = ngram_positions[i + 1][0]
+ print(f' ---- start_next : {start_next} ----- ')
+ if start_next > end_prev + 1:
+ mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
+ print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
+ mask_indices.append(mask_index_between_ngrams)
+
+ # Mask a word after the last common n-gram
+ last_ngram_end = ngram_positions[-1][1]
+ if last_ngram_end < len(non_stop_words) - 1:
+ print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
+ mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
+ print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
+ mask_indices.append(mask_index_after_ngram)
+
+ # Create mapping from non-stop words to original indices
+ non_stop_to_original = {}
+ non_stop_idx = 0
+ for orig_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ non_stop_to_original[non_stop_idx] = orig_idx
+ non_stop_idx += 1
+
+ # Map mask indices from non-stop word positions to original positions
+ print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ')
+ original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
+ print(f' ---- original_mask_indices : {original_mask_indices} ----- ')
+
+ # Apply masks to the original sentence
+ masked_words = original_words.copy()
+ for idx in original_mask_indices:
+ masked_words[idx] = self.tokenizer.mask_token
+
+ return " ".join(masked_words)
+
+
+ def calculate_word_entropy(self, sentence, word_position):
+ """
+ Calculate entropy for a specific word position in the sentence.
+
+ Args:
+ sentence (str): The input sentence
+ word_position (int): Position of the word to calculate entropy for
+
+ Returns:
+ float: Entropy value for the word
+ """
+ words = sentence.split()
+ masked_words = words.copy()
+ masked_words[word_position] = self.tokenizer.mask_token
+ masked_sentence = " ".join(masked_words)
+
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+
+ # Get probabilities for the masked position
+ probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1)
+ # Calculate entropy: -sum(p * log(p))
+ entropy = -torch.sum(probs * torch.log(probs + 1e-9))
+
+ return entropy.item()
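+
+    # Rough sketch of the quantity computed above: H = -sum(p * log p) over the
+    # predicted vocabulary distribution at the masked position. A flat (uncertain)
+    # distribution yields high entropy, a peaked (confident) one yields low entropy,
+    # which is why high-entropy positions are favoured for masking. Illustrative
+    # values only, not model output:
+    #
+    #   peaked = torch.tensor([0.97, 0.01, 0.01, 0.01])
+    #   flat = torch.tensor([0.25, 0.25, 0.25, 0.25])
+    #   -torch.sum(peaked * torch.log(peaked + 1e-9))  # ~0.17
+    #   -torch.sum(flat * torch.log(flat + 1e-9))      # ~1.39 (= ln 4)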
+
+ def mask_sentence_entropy(self, sentence, common_ngrams):
+ """
+ Mask words in the sentence based on entropy, following n-gram positioning rules.
+
+ Args:
+ sentence (str): Original sentence
+ common_ngrams (dict): Common n-grams and their indices
+
+ Returns:
+ str: Masked sentence
+ """
+ original_words = sentence.split()
+ non_stop_words = self.remove_stopwords(original_words)
+ adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
+
+ # Create mapping from non-stop words to original indices
+ non_stop_to_original = {}
+ original_to_non_stop = {}
+ non_stop_idx = 0
+ for orig_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ non_stop_to_original[non_stop_idx] = orig_idx
+ original_to_non_stop[orig_idx] = non_stop_idx
+ non_stop_idx += 1
+
+ ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
+ mask_indices = []
+
+ if ngram_positions:
+ # Handle words before first n-gram
+ first_ngram_start = ngram_positions[0][0]
+ if first_ngram_start > 0:
+ # Calculate entropy for all candidate positions
+ candidate_positions = range(0, first_ngram_start)
+ entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
+ for pos in candidate_positions]
+ # Select position with highest entropy
+ mask_indices.append(max(entropies, key=lambda x: x[1])[0])
+
+ # Handle words between n-grams
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][1]
+ start_next = ngram_positions[i + 1][0]
+ if start_next > end_prev + 1:
+ candidate_positions = range(end_prev + 1, start_next)
+ entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
+ for pos in candidate_positions]
+ mask_indices.append(max(entropies, key=lambda x: x[1])[0])
+
+ # Handle words after last n-gram
+ last_ngram_end = ngram_positions[-1][1]
+ if last_ngram_end < len(non_stop_words) - 1:
+ candidate_positions = range(last_ngram_end + 1, len(non_stop_words))
+ entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
+ for pos in candidate_positions]
+ mask_indices.append(max(entropies, key=lambda x: x[1])[0])
+
+ # Map mask indices to original sentence positions and apply masks
+ original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
+ masked_words = original_words.copy()
+ for idx in original_mask_indices:
+ masked_words[idx] = self.tokenizer.mask_token
+
+ return " ".join(masked_words)
+
+
+ def calculate_mask_logits(self, masked_sentence):
+ """
+ Calculate logits for masked tokens in the sentence using BERT.
+
+ Args:
+ masked_sentence (str): Sentence with [MASK] tokens.
+
+ Returns:
+ dict: Masked token indices and their logits.
+ """
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+
+ mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
+ return mask_logits
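+
+    # Hedged usage sketch for the raw logits returned above: each entry is a
+    # vocabulary-sized list, so a caller can recover the top-k candidate tokens per
+    # masked position roughly like this (assumes a `processor` instance of this class):
+    #
+    #   masked = "The quick brown [MASK] jumps over the lazy dog"
+    #   logits = processor.calculate_mask_logits(masked)
+    #   for pos, vocab_logits in logits.items():
+    #       top = torch.topk(torch.tensor(vocab_logits), k=5)
+    #       print(pos, processor.tokenizer.convert_ids_to_tokens(top.indices.tolist()))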
+
+ def process_sentences(self, sentences, result_dict, method="random"):
+ """
+ Process sentences and calculate logits for masked tokens.
+
+ Args:
+ sentences (list): List of sentences
+ result_dict (dict): Dictionary of common n-grams
+ method (str): Masking method ("random" or "entropy")
+
+ Returns:
+ dict: Masked sentences and logits for each sentence
+ """
+ results = {}
+
+ for sentence, ngrams in result_dict.items():
+ if method == "random":
+ masked_sentence = self.mask_sentence_random(sentence, ngrams)
+ elif method == "pseudorandom":
+ masked_sentence = self.mask_sentence_pseudorandom(sentence, ngrams)
+ else: # entropy
+ masked_sentence = self.mask_sentence_entropy(sentence, ngrams)
+
+ logits = self.calculate_mask_logits(masked_sentence)
+ results[sentence] = {
+ "masked_sentence": masked_sentence,
+ "mask_logits": logits
+ }
+
+ return results
+
+
+
+if __name__ == "__main__":
+    # !!! Works in both cases, whether or not stopwords are removed
+ sentences = [
+ "The quick brown fox jumps over the lazy dog everyday.",
+ # "A speedy brown fox jumps over a lazy dog.",
+ # "A swift brown fox leaps over the lethargic dog."
+ ]
+    result_dict = {
+ 'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
+ # 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
+ # 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
+ }
+
+
+ processor = MaskingProcessor()
+ # results_random = processor.process_sentences(sentences, result_dict)
+ results_entropy = processor.process_sentences(sentences, result_dict, method="random")
+
+ # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)
+
+ for sentence, output in results_entropy.items():
+ print(f"Original Sentence (Random): {sentence}")
+ print(f"Masked Sentence (Random): {output['masked_sentence']}")
+ # print(f"Mask Logits (Random): {output['mask_logits']}")
+ print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
+ print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
+ print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
+ print('--------------------------------')
+ for mask_idx, logits in output["mask_logits"].items():
+ print(f"Logits for [MASK] at position {mask_idx}:")
+            print(f' logits : {logits[:5]}') # first 5 entries of the vocabulary-sized logit list
+ print(f' len(logits) : {len(logits)}')
+
+
+
+
+# -------------------------------------------------------------------------------------------
+ # def mask_sentence(self, sentence, common_ngrams):
+ # """
+ # Mask words in the sentence based on the specified rules after removing stopwords.
+
+ # Args:
+ # sentence (str): Original sentence.
+ # common_ngrams (dict): Common n-grams and their indices.
+
+ # Returns:
+ # str: Masked sentence.
+ # """
+ # original_words = sentence.split()
+ # print(f' ---- original_words : {original_words} ----- ')
+ # non_stop_words = self.remove_stopwords(original_words)
+ # print(f' ---- non_stop_words : {non_stop_words} ----- ')
+ # adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
+ # print(f' ---- common_ngrams : {common_ngrams} ----- ')
+ # print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')
+
+ # mask_indices = []
+
+ # # Extract n-gram positions in non-stop words
+ # ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
+ # print(f' ---- ngram_positions : {ngram_positions} ----- ')
+ # # Mask a word before the first common n-gram
+ # if ngram_positions:
+ # first_ngram_start = ngram_positions[0][0]
+ # print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
+ # if first_ngram_start > 0:
+ # mask_index_before_ngram = random.randint(0, first_ngram_start-1)
+ # print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
+ # mask_indices.append(mask_index_before_ngram)
+
+ # # Mask words between common n-grams
+ # for i in range(len(ngram_positions) - 1):
+ # end_prev = ngram_positions[i][1]
+ # print(f' ---- end_prev : {end_prev} ----- ')
+ # start_next = ngram_positions[i + 1][0]
+ # print(f' ---- start_next : {start_next} ----- ')
+ # if start_next > end_prev + 1:
+ # mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
+ # print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
+ # mask_indices.append(mask_index_between_ngrams)
+
+ # # Mask a word after the last common n-gram
+ # last_ngram_end = ngram_positions[-1][1]
+ # print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
+ # if last_ngram_end < len(non_stop_words) - 1:
+ # mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
+ # print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
+ # mask_indices.append(mask_index_after_ngram)
+
+ # # Map mask indices back to original sentence
+ # adjusted_indices = [
+ # orig for orig, non_stop in enumerate(original_words)
+ # if non_stop in mask_indices
+ # ]
+
+ # # Apply masks to the original sentence
+ # for idx in adjusted_indices:
+ # original_words[idx] = self.tokenizer.mask_token
+
+ # return " ".join(original_words)
diff --git a/utils/old/masking/masking_methods_ok_working.py b/utils/old/masking/masking_methods_ok_working.py
new file mode 100644
index 0000000000000000000000000000000000000000..59fb09c9a16273344f9f949a13be894403c16ddc
--- /dev/null
+++ b/utils/old/masking/masking_methods_ok_working.py
@@ -0,0 +1,257 @@
+import random
+import torch
+from transformers import BertTokenizer, BertForMaskedLM
+from nltk.corpus import stopwords
+import nltk
+
+# Ensure stopwords are downloaded
+try:
+ nltk.data.find('corpora/stopwords')
+except LookupError:
+ nltk.download('stopwords')
+
+class MaskingProcessor:
+    def __init__(self):
+ self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+ self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
+ self.stop_words = set(stopwords.words('english'))
+
+ def adjust_ngram_indices(self, words, common_ngrams, remove_stopwords):
+ """
+ Adjust indices of common n-grams after removing stop words.
+
+ Args:
+ words (list): List of words in the original sentence.
+ common_ngrams (dict): Common n-grams and their indices.
+
+ Returns:
+ dict: Adjusted common n-grams and their indices.
+ """
+ if not remove_stopwords:
+ return common_ngrams
+
+ non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
+ adjusted_ngrams = {}
+
+ for ngram, positions in common_ngrams.items():
+ adjusted_positions = []
+ for start, end in positions:
+ try:
+ new_start = non_stop_word_indices.index(start)
+ new_end = non_stop_word_indices.index(end)
+ adjusted_positions.append((new_start, new_end))
+ except ValueError:
+ continue # Skip if indices cannot be mapped
+ adjusted_ngrams[ngram] = adjusted_positions
+
+ return adjusted_ngrams
+
+ def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords):
+ """
+ Mask one word before the first common n-gram, one between two n-grams,
+ and one after the last common n-gram (random selection).
+
+ Args:
+ original_sentence (str): Original sentence
+ common_ngrams (dict): Common n-grams and their indices
+
+ Returns:
+ str: Masked sentence with original stop words retained
+ """
+ words = original_sentence.split()
+ non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
+ adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)
+
+ mask_indices = []
+ # Handle before the first common n-gram
+ if adjusted_ngrams:
+ first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
+ if first_ngram_start > 0:
+ mask_indices.append(random.randint(0, first_ngram_start - 1))
+
+ # Handle between common n-grams
+ ngram_positions = list(adjusted_ngrams.values())
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][-1][1]
+ start_next = ngram_positions[i + 1][0][0]
+ if start_next > end_prev + 1:
+ mask_indices.append(random.randint(end_prev + 1, start_next - 1))
+
+        # Handle after the last common n-gram (guard against sentences with no common n-grams)
+        if ngram_positions:
+            last_ngram_end = ngram_positions[-1][-1][1]
+            if last_ngram_end < len(non_stop_words) - 1:
+                mask_indices.append(random.randint(last_ngram_end + 1, len(non_stop_words) - 1))
+
+ # Mask the chosen indices
+ original_masked_sentence = words[:]
+ for idx in mask_indices:
+ if idx not in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
+ non_stop_words[idx] = self.tokenizer.mask_token
+ original_masked_sentence[idx] = self.tokenizer.mask_token
+
+ return " ".join(original_masked_sentence)
+
+ def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords):
+ """
+ Mask one word before the first common n-gram, one between two n-grams,
+ and one after the last common n-gram (highest entropy selection).
+
+ Args:
+ original_sentence (str): Original sentence
+ common_ngrams (dict): Common n-grams and their indices
+
+ Returns:
+ str: Masked sentence with original stop words retained
+ """
+ words = original_sentence.split()
+ non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
+ adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)
+ entropy_scores = {}
+
+ for idx, word in enumerate(non_stop_words):
+ if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
+ continue # Skip words in common n-grams
+
+ masked_sentence = non_stop_words[:idx] + [self.tokenizer.mask_token] + non_stop_words[idx + 1:]
+ masked_sentence = " ".join(masked_sentence)
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+
+ filtered_logits = logits[0, mask_token_index, :]
+ probs = torch.softmax(filtered_logits, dim=-1)
+ entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item() # Add epsilon to prevent log(0)
+ entropy_scores[idx] = entropy
+
+ mask_indices = []
+
+ # Handle before the first common n-gram
+ if adjusted_ngrams:
+ first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
+ candidates = [i for i in range(first_ngram_start) if i in entropy_scores]
+ if candidates:
+ mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))
+
+ # Handle between common n-grams
+ ngram_positions = list(adjusted_ngrams.values())
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][-1][1]
+ start_next = ngram_positions[i + 1][0][0]
+ candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores]
+ if candidates:
+ mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))
+
+        # Handle after the last common n-gram (guard against sentences with no common n-grams)
+        if ngram_positions:
+            last_ngram_end = ngram_positions[-1][-1][1]
+            candidates = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i in entropy_scores]
+            if candidates:
+                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))
+
+ # Mask the chosen indices
+ original_masked_sentence = words[:]
+ for idx in mask_indices:
+ non_stop_words[idx] = self.tokenizer.mask_token
+ original_masked_sentence[idx] = self.tokenizer.mask_token
+
+ return " ".join(original_masked_sentence)
+
+ def calculate_mask_logits(self, masked_sentence):
+ """
+ Calculate logits for masked tokens in the sentence using BERT.
+
+ Args:
+ masked_sentence (str): Sentence with [MASK] tokens
+
+ Returns:
+ dict: Masked token indices and their logits
+ """
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+
+ mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
+ return mask_logits
+
+ def process_sentences(self, original_sentences, result_dict, method="random", remove_stopwords=False):
+ """
+ Process a list of sentences and calculate logits for masked tokens using the specified method.
+
+ Args:
+ original_sentences (list): List of original sentences
+ result_dict (dict): Common n-grams and their indices for each sentence
+ method (str): Masking method ("random" or "entropy")
+
+ Returns:
+ dict: Masked sentences and their logits for each sentence
+ """
+ results = {}
+
+ for sentence, ngrams in result_dict.items():
+ if method == "random":
+ masked_sentence = self.mask_sentence_random(sentence, ngrams, remove_stopwords)
+ elif method == "entropy":
+ masked_sentence = self.mask_sentence_entropy(sentence, ngrams, remove_stopwords)
+ else:
+ raise ValueError("Invalid method. Choose 'random' or 'entropy'.")
+
+ logits = self.calculate_mask_logits(masked_sentence)
+ results[sentence] = {
+ "masked_sentence": masked_sentence,
+ "mask_logits": logits
+ }
+
+ return results
+
+# Example usage
+if __name__ == "__main__":
+    # !!! Works in both cases, whether or not stopwords are removed
+ sentences = [
+ "The quick brown fox jumps over the lazy dog.",
+ "A quick brown dog outpaces a lazy fox.",
+ "Quick brown animals leap over lazy obstacles."
+ ]
+
+ result_dict = {
+ "The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]},
+ "A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]},
+ "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]}
+ }
+
+ # result_dict = {
+ # "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
+ # "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
+ # "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
+ # }
+
+ processor = MaskingProcessor()
+ results_random = processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)
+ # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)
+
+ for sentence, output in results_random.items():
+ print(f"Original Sentence (Random): {sentence}")
+ print(f"Masked Sentence (Random): {output['masked_sentence']}")
+ # print(f"Mask Logits (Random): {output['mask_logits']}")
+ print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
+ print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
+ print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
+ print('--------------------------------')
+ for mask_idx, logits in output["mask_logits"].items():
+ print(f"Logits for [MASK] at position {mask_idx}:")
+            print(f' logits : {logits[:5]}') # first 5 entries of the vocabulary-sized logit list
+
+
+
+ # print('--------------------------------')
+ # for sentence, output in results_entropy.items():
+ # print(f"Original Sentence (Entropy): {sentence}")
+ # print(f"Masked Sentence (Entropy): {output['masked_sentence']}")
+ # # print(f"Mask Logits (Entropy): {output['mask_logits']}")
+ # print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
+ # print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
+ # print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
\ No newline at end of file
diff --git a/utils/old/masking/masking_methods_v1_working.py b/utils/old/masking/masking_methods_v1_working.py
new file mode 100644
index 0000000000000000000000000000000000000000..12b4efc66e6188eb743db8fca94391f0ad212ec9
--- /dev/null
+++ b/utils/old/masking/masking_methods_v1_working.py
@@ -0,0 +1,233 @@
+import random
+import torch
+from transformers import BertTokenizer, BertForMaskedLM
+from nltk.corpus import stopwords
+import nltk
+
+ # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ # THIS VERSION WORKS WHEN THE N-GRAM COORDINATES ARE COMPUTED WITHOUT REMOVING STOPWORDS
+ # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+# Ensure stopwords are downloaded
+try:
+ nltk.data.find('corpora/stopwords')
+except LookupError:
+ nltk.download('stopwords')
+
+class MaskingProcessor:
+ def __init__(self):
+ self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+ self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
+ self.stop_words = set(stopwords.words('english'))
+
+ def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords=False):
+ """
+ Mask one word before the first common n-gram, one between two n-grams,
+ and one after the last common n-gram (random selection).
+
+ Args:
+ original_sentence (str): Original sentence
+ common_ngrams (dict): Common n-grams and their indices
+
+ Returns:
+ str: Masked sentence
+ """
+ if remove_stopwords:
+ words = original_sentence.split()
+ words = [word for word in words if word not in self.stop_words]
+ else:
+ words = original_sentence.split()
+
+ mask_indices = []
+ # Handle before the first common n-gram
+ if common_ngrams:
+ first_ngram_start = list(common_ngrams.values())[0][0][0]
+ if first_ngram_start > 0:
+ mask_indices.append(random.randint(0, first_ngram_start - 1))
+
+ # Handle between common n-grams
+ ngram_positions = list(common_ngrams.values())
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][-1][1]
+ start_next = ngram_positions[i + 1][0][0]
+ if start_next > end_prev + 1:
+ mask_indices.append(random.randint(end_prev + 1, start_next - 1))
+
+        # Handle after the last common n-gram (guard against sentences with no common n-grams)
+        if ngram_positions:
+            last_ngram_end = ngram_positions[-1][-1][1]
+            if last_ngram_end < len(words) - 1:
+                mask_indices.append(random.randint(last_ngram_end + 1, len(words) - 1))
+
+ # Mask the chosen indices
+ for idx in mask_indices:
+ if idx not in [index for ngram_indices in common_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
+ words[idx] = self.tokenizer.mask_token
+
+ return " ".join(words)
+
+ def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords=False):
+ """
+ Mask one word before the first common n-gram, one between two n-grams,
+ and one after the last common n-gram (highest entropy selection).
+
+ Args:
+ original_sentence (str): Original sentence
+ common_ngrams (dict): Common n-grams and their indices
+
+ Returns:
+ str: Masked sentence
+ """
+ if remove_stopwords:
+ words = original_sentence.split()
+ words = [word for word in words if word not in self.stop_words]
+ else:
+ words = original_sentence.split()
+ entropy_scores = {}
+
+ for idx, word in enumerate(words):
+ if idx in [index for ngram_indices in common_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
+ continue # Skip words in common n-grams
+
+ masked_sentence = words[:idx] + [self.tokenizer.mask_token] + words[idx + 1:]
+ masked_sentence = " ".join(masked_sentence)
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+
+ filtered_logits = logits[0, mask_token_index, :]
+ probs = torch.softmax(filtered_logits, dim=-1)
+ entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item() # Add epsilon to prevent log(0)
+ entropy_scores[idx] = entropy
+
+ mask_indices = []
+
+ # Handle before the first common n-gram
+ if common_ngrams:
+ first_ngram_start = list(common_ngrams.values())[0][0][0]
+ candidates = [i for i in range(first_ngram_start) if i in entropy_scores]
+ if candidates:
+ mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))
+
+ # Handle between common n-grams
+ ngram_positions = list(common_ngrams.values())
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][-1][1]
+ start_next = ngram_positions[i + 1][0][0]
+ candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores]
+ if candidates:
+ mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))
+
+        # Handle after the last common n-gram (guard against sentences with no common n-grams)
+        if ngram_positions:
+            last_ngram_end = ngram_positions[-1][-1][1]
+            candidates = [i for i in range(last_ngram_end + 1, len(words)) if i in entropy_scores]
+            if candidates:
+                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))
+
+ # Mask the chosen indices
+ for idx in mask_indices:
+ words[idx] = self.tokenizer.mask_token
+
+ return " ".join(words)
+
+ def calculate_mask_logits(self, masked_sentence):
+ """
+ Calculate logits for masked tokens in the sentence using BERT.
+
+ Args:
+ masked_sentence (str): Sentence with [MASK] tokens
+
+ Returns:
+ dict: Masked token indices and their logits
+ """
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+
+ mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
+ return mask_logits
+
+ def process_sentences(self, original_sentences, result_dict, remove_stopwords=False, method="random"):
+ """
+ Process a list of sentences and calculate logits for masked tokens using the specified method.
+
+ Args:
+ original_sentences (list): List of original sentences
+ result_dict (dict): Common n-grams and their indices for each sentence
+ method (str): Masking method ("random" or "entropy")
+
+ Returns:
+ dict: Masked sentences and their logits for each sentence
+ """
+ results = {}
+
+ for sentence, ngrams in result_dict.items():
+ if method == "random":
+ masked_sentence = self.mask_sentence_random(sentence, ngrams)
+ elif method == "entropy":
+ masked_sentence = self.mask_sentence_entropy(sentence, ngrams)
+ else:
+ raise ValueError("Invalid method. Choose 'random' or 'entropy'.")
+
+ logits = self.calculate_mask_logits(masked_sentence)
+ results[sentence] = {
+ "masked_sentence": masked_sentence,
+ "mask_logits": logits
+ }
+
+ return results
+
+# Example usage
+if __name__ == "__main__":
+ # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    # THIS VERSION WORKS WHEN THE N-GRAM COORDINATES ARE COMPUTED WITHOUT REMOVING STOPWORDS
+
+ sentences = [
+ "The quick brown fox jumps over the lazy dog.",
+ "A quick brown dog outpaces a lazy fox.",
+ "Quick brown animals leap over lazy obstacles."
+ ]
+
+ result_dict = {
+ "The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]},
+ "A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]},
+ "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]}
+ }
+
+ # result_dict = {
+ # "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
+ # "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
+ # "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
+ # }
+
+ processor = MaskingProcessor()
+ results_random = processor.process_sentences(sentences, result_dict, remove_stopwords=True, method="random")
+ results_entropy = processor.process_sentences(sentences, result_dict, remove_stopwords=True, method="entropy")
+
+ for sentence, output in results_random.items():
+ print(f"Original Sentence (Random): {sentence}")
+ print(f"Masked Sentence (Random): {output['masked_sentence']}")
+ # print(f"Mask Logits (Random): {output['mask_logits']}")
+
+ for sentence, output in results_entropy.items():
+ print(f"Original Sentence (Entropy): {sentence}")
+ print(f"Masked Sentence (Entropy): {output['masked_sentence']}")
+ # print(f"Mask Logits (Entropy): {output['mask_logits']}")
+
+
+
+
+'''
+ result_dict = {
+ "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
+ "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
+ "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
+ }
+
+'''
\ No newline at end of file
diff --git a/utils/old/masking_methods_final_copy.py b/utils/old/masking_methods_final_copy.py
new file mode 100644
index 0000000000000000000000000000000000000000..e54f32c6de9c0c33da175c60beb376d632afaf6f
--- /dev/null
+++ b/utils/old/masking_methods_final_copy.py
@@ -0,0 +1,619 @@
+import random
+import torch
+from transformers import BertTokenizer, BertForMaskedLM
+from nltk.corpus import stopwords
+import nltk
+from transformers import RobertaTokenizer, RobertaForMaskedLM
+
+
+# Ensure stopwords are downloaded
+try:
+ nltk.data.find('corpora/stopwords')
+except LookupError:
+ nltk.download('stopwords')
+
+class MaskingProcessor:
+ # def __init__(self, tokenizer, model):
+ def __init__(self):
+ # self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+ # self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
+
+ # self.tokenizer = tokenizer
+ # self.model = model
+
+ self.tokenizer = BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+ self.model = BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
+
+ # self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+ # self.model = RobertaForMaskedLM.from_pretrained("roberta-base")
+
+ self.stop_words = set(stopwords.words('english'))
+
+ def remove_stopwords(self, words):
+ """
+ Remove stopwords from the given list of words.
+
+ Args:
+ words (list): List of words.
+
+ Returns:
+ list: List of non-stop words.
+ """
+ return [word for word in words if word.lower() not in self.stop_words]
+
+ def adjust_ngram_indices(self, original_words, common_ngrams):
+ """
+ Adjust indices of common n-grams after removing stopwords.
+
+ Args:
+ original_words (list): Original list of words.
+ common_ngrams (dict): Common n-grams and their indices.
+
+ Returns:
+ dict: Adjusted common n-grams with updated indices.
+ """
+ non_stop_words = self.remove_stopwords(original_words)
+ original_to_non_stop = []
+ non_stop_idx = 0
+
+ for original_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ original_to_non_stop.append((original_idx, non_stop_idx))
+ non_stop_idx += 1
+
+ adjusted_ngrams = {}
+ for ngram, positions in common_ngrams.items():
+ adjusted_positions = []
+ for start, end in positions:
+ try:
+ new_start = next(non_stop for orig, non_stop in original_to_non_stop if orig == start)
+ new_end = next(non_stop for orig, non_stop in original_to_non_stop if orig == end)
+ adjusted_positions.append((new_start, new_end))
+ except StopIteration:
+ continue # Skip if indices cannot be mapped
+ adjusted_ngrams[ngram] = adjusted_positions
+
+ return adjusted_ngrams
+
+ def mask_sentence_random(self, sentence, common_ngrams):
+ """
+ Mask words in the sentence based on the specified rules after removing stopwords.
+ """
+ # Split sentence into words
+ original_words = sentence.split()
+
+ # Handle punctuation at the end
+ has_punctuation = False
+ punctuation = None
+ if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
+ has_punctuation = True
+ punctuation = original_words[-1][-1]
+ original_words = original_words[:-1]
+
+ print(f' ---- original_words : {original_words} ----- ')
+
+ # Process words without punctuation
+ non_stop_words = self.remove_stopwords(original_words)
+ adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
+
+        # Choose mask positions relative to the common n-gram spans
+ mask_indices = []
+ ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
+
+ if ngram_positions:
+ first_ngram_start = ngram_positions[0][0]
+ if first_ngram_start > 0:
+ mask_index_before_ngram = random.randint(0, first_ngram_start-1)
+ mask_indices.append(mask_index_before_ngram)
+
+ # Mask words between common n-grams
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][1]
+ start_next = ngram_positions[i + 1][0]
+ if start_next > end_prev + 1:
+ mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
+ mask_indices.append(mask_index_between_ngrams)
+
+ # Mask a word after the last common n-gram
+ last_ngram_end = ngram_positions[-1][1]
+ if last_ngram_end < len(non_stop_words) - 1:
+ mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
+ mask_indices.append(mask_index_after_ngram)
+
+ # Create mapping from non-stop words to original indices
+ non_stop_to_original = {}
+ non_stop_idx = 0
+ for orig_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ non_stop_to_original[non_stop_idx] = orig_idx
+ non_stop_idx += 1
+
+ # Map mask indices and apply masks
+ original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
+ masked_words = original_words.copy()
+ for idx in original_mask_indices:
+ masked_words[idx] = self.tokenizer.mask_token
+ # masked_words[idx] = '' # for roberta
+
+ # Add back punctuation if it existed
+ if has_punctuation:
+ masked_words.append(punctuation)
+
+ print(f' ***** masked_words at end : {masked_words} ***** ')
+ print(f' ***** original_mask_indices : {original_mask_indices} ***** ')
+ print(f' ***** TESTING : {" ".join(masked_words)} ***** ')
+
+ return " ".join(masked_words), original_mask_indices
+
+ def mask_sentence_pseudorandom(self, sentence, common_ngrams):
+ """
+ Mask words in the sentence based on the specified rules after removing stopwords.
+ """
+ # Split sentence into words
+ random.seed(3)
+ original_words = sentence.split()
+
+ # Handle punctuation at the end
+ has_punctuation = False
+ punctuation = None
+ if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
+ has_punctuation = True
+ punctuation = original_words[-1][-1]
+ original_words = original_words[:-1]
+
+ print(f' ---- original_words : {original_words} ----- ')
+
+ # Process words without punctuation
+ non_stop_words = self.remove_stopwords(original_words)
+ adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
+
+        # Choose mask positions relative to the common n-gram spans
+ mask_indices = []
+ ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
+
+ if ngram_positions:
+ first_ngram_start = ngram_positions[0][0]
+ if first_ngram_start > 0:
+ mask_index_before_ngram = random.randint(0, first_ngram_start-1)
+ mask_indices.append(mask_index_before_ngram)
+
+ # Mask words between common n-grams
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][1]
+ start_next = ngram_positions[i + 1][0]
+ if start_next > end_prev + 1:
+ mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
+ mask_indices.append(mask_index_between_ngrams)
+
+ # Mask a word after the last common n-gram
+ last_ngram_end = ngram_positions[-1][1]
+ if last_ngram_end < len(non_stop_words) - 1:
+ mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
+ mask_indices.append(mask_index_after_ngram)
+
+ # Create mapping from non-stop words to original indices
+ non_stop_to_original = {}
+ non_stop_idx = 0
+ for orig_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ non_stop_to_original[non_stop_idx] = orig_idx
+ non_stop_idx += 1
+
+ # Map mask indices and apply masks
+ original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
+ masked_words = original_words.copy()
+ for idx in original_mask_indices:
+ masked_words[idx] = self.tokenizer.mask_token
+ # masked_words[idx] = '' # for roberta
+
+ # Add back punctuation if it existed
+ if has_punctuation:
+ masked_words.append(punctuation)
+
+ print(f' ***** masked_words at end : {masked_words} ***** ')
+ print(f' ***** original_mask_indices : {original_mask_indices} ***** ')
+ print(f' ***** TESTING : {" ".join(masked_words)} ***** ')
+
+ return " ".join(masked_words), original_mask_indices
+
+
+ def calculate_word_entropy(self, sentence, word_position):
+ """
+ Calculate entropy for a specific word position in the sentence.
+
+ Args:
+ sentence (str): The input sentence
+ word_position (int): Position of the word to calculate entropy for
+
+ Returns:
+ float: Entropy value for the word
+ """
+ words = sentence.split()
+ masked_words = words.copy()
+ masked_words[word_position] = self.tokenizer.mask_token
+ masked_sentence = " ".join(masked_words)
+
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+
+ # Get probabilities for the masked position
+ probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1)
+ # Calculate entropy: -sum(p * log(p))
+ entropy = -torch.sum(probs * torch.log(probs + 1e-9))
+
+ return entropy.item()
+
+ def mask_sentence_entropy(self, sentence, common_ngrams):
+ """
+ Mask words in the sentence based on entropy, following n-gram positioning rules.
+
+ Args:
+ sentence (str): Original sentence
+ common_ngrams (dict): Common n-grams and their indices
+
+ Returns:
+ str: Masked sentence
+ """
+ # Split sentence into words
+ original_words = sentence.split()
+
+ # Handle punctuation at the end
+ has_punctuation = False
+ punctuation = None
+ if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
+ has_punctuation = True
+ punctuation = original_words[-1][-1]
+ original_words = original_words[:-1]
+
+ # Process words without punctuation
+ non_stop_words = self.remove_stopwords(original_words)
+ adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
+
+ # Create mapping from non-stop words to original indices
+ non_stop_to_original = {}
+ original_to_non_stop = {}
+ non_stop_idx = 0
+ for orig_idx, word in enumerate(original_words):
+ if word.lower() not in self.stop_words:
+ non_stop_to_original[non_stop_idx] = orig_idx
+ original_to_non_stop[orig_idx] = non_stop_idx
+ non_stop_idx += 1
+
+ ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
+ mask_indices = []
+
+ if ngram_positions:
+ # Handle words before first n-gram
+ first_ngram_start = ngram_positions[0][0]
+ if first_ngram_start > 0:
+ candidate_positions = range(0, first_ngram_start)
+ entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
+ for pos in candidate_positions]
+ mask_indices.append(max(entropies, key=lambda x: x[1])[0])
+
+ # Handle words between n-grams
+ for i in range(len(ngram_positions) - 1):
+ end_prev = ngram_positions[i][1]
+ start_next = ngram_positions[i + 1][0]
+ if start_next > end_prev + 1:
+ candidate_positions = range(end_prev + 1, start_next)
+ entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
+ for pos in candidate_positions]
+ mask_indices.append(max(entropies, key=lambda x: x[1])[0])
+
+ # Handle words after last n-gram
+ last_ngram_end = ngram_positions[-1][1]
+ if last_ngram_end < len(non_stop_words) - 1:
+ candidate_positions = range(last_ngram_end + 1, len(non_stop_words))
+ entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
+ for pos in candidate_positions]
+ mask_indices.append(max(entropies, key=lambda x: x[1])[0])
+
+ # Map mask indices to original sentence positions and apply masks
+ original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
+ masked_words = original_words.copy()
+ for idx in original_mask_indices:
+ masked_words[idx] = self.tokenizer.mask_token
+
+ # Add back punctuation if it existed
+ if has_punctuation:
+ masked_words.append(punctuation)
+
+ return " ".join(masked_words), original_mask_indices
+
+ def calculate_mask_logits(self, original_sentence, original_mask_indices):
+ """
+ Calculate logits for masked tokens in the sentence using BERT.
+
+ Args:
+ original_sentence (str): Original sentence without masks
+ original_mask_indices (list): List of indices to mask
+
+ Returns:
+ dict: Masked token indices and their logits
+ """
+ print('==========================================================================================================')
+ words = original_sentence.split()
+ print(f' ##### calculate_mask_logits >> words : {words} ##### ')
+ mask_logits = {}
+
+ for idx in original_mask_indices:
+ # Create a copy of words and mask the current position
+ print(f' ---- idx : {idx} ----- ')
+ masked_words = words.copy()
+ masked_words[idx] = '[MASK]'
+ # masked_words[idx] = '' # for roberta
+ masked_sentence = " ".join(masked_words)
+ print(f' ---- masked_sentence : {masked_sentence} ----- ')
+
+ # Calculate logits for the current mask
+ input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
+ mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+
+ # Extract logits for the masked position
+ mask_logits_tensor = logits[0, mask_token_index, :]
+
+ # Get top logits and corresponding tokens
+ top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 100, dim=-1) # Get more candidates
+
+ # Convert token IDs to words and filter out subword tokens
+ top_tokens = []
+ top_logits = []
+ seen_words = set() # To keep track of unique words
+
+ for token_id, logit in zip(top_mask_indices[0], top_mask_logits[0]):
+ token = self.tokenizer.convert_ids_to_tokens(token_id.item())
+
+ # Skip if it's a subword token (starts with ##)
+ if token.startswith('##'):
+ continue
+
+ # Convert token to proper word
+ word = self.tokenizer.convert_tokens_to_string([token]).strip()
+
+ # Only add if it's a new word and not empty
+ if word and word not in seen_words:
+ seen_words.add(word)
+ top_tokens.append(word)
+ top_logits.append(logit.item())
+
+ # Break if we have 50 unique complete words
+ if len(top_tokens) == 50:
+ break
+
+ # print(f' ---- top_tokens : {top_tokens} ----- ')
+
+ # Store results
+ mask_logits[idx] = {
+ "tokens": top_tokens,
+ "logits": top_logits
+ }
+
+ return mask_logits
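+
+    # Illustrative sketch of the subword filter used above: WordPiece continuation
+    # pieces start with "##" and are skipped so that only unique whole-word
+    # candidates are kept (toy token list, not actual model output):
+    #
+    #   candidates = ["dog", "##s", "puppy", "dog", "wolf"]
+    #   seen, whole_words = set(), []
+    #   for tok in candidates:
+    #       if tok.startswith("##") or tok in seen:
+    #           continue
+    #       seen.add(tok)
+    #       whole_words.append(tok)
+    #   # whole_words == ["dog", "puppy", "wolf"]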
+
+ # def calculate_mask_logits(self, original_sentence, original_mask_indices):
+ # """
+ # Calculate logits for masked tokens in the sentence using BERT.
+
+ # Args:
+ # original_sentence (str): Original sentence without masks
+ # original_mask_indices (list): List of indices to mask
+
+ # Returns:
+ # dict: Masked token indices and their logits
+ # """
+ # words = original_sentence.split()
+ # print(f' ##### calculate_mask_logits >> words : {words} ##### ')
+ # mask_logits = {}
+
+ # for idx in original_mask_indices:
+ # # Create a copy of words and mask the current position
+ # print(f' ---- idx : {idx} ----- ')
+ # masked_words = words.copy()
+ # print(f' ---- words : {masked_words} ----- ')
+ # # masked_words[idx] = self.tokenizer.mask_token
+ # masked_words[idx] = '[MASK]'
+ # print(f' ---- masked_words : {masked_words} ----- ')
+ # masked_sentence = " ".join(masked_words)
+ # print(f' ---- masked_sentence : {masked_sentence} ----- ')
+
+ # # Calculate logits for the current mask
+ # input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
+ # mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ # with torch.no_grad():
+ # outputs = self.model(input_ids)
+ # logits = outputs.logits
+
+ # # Extract logits for the masked position
+ # mask_logits_tensor = logits[0, mask_token_index, :]
+
+ # # Get top 50 logits and corresponding tokens
+ # top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 50, dim=-1)
+
+ # # Convert token IDs to words
+ # top_tokens = [self.tokenizer.convert_ids_to_tokens(token_id.item()) for token_id in top_mask_indices[0]]
+ # print(f' ---- top_tokens : {top_tokens} ----- ')
+
+ # # Store results
+ # mask_logits[idx] = {
+ # "tokens": top_tokens,
+ # "logits": top_mask_logits.tolist()
+ # }
+
+ # return mask_logits
+
+
+ def process_sentences(self, sentences, result_dict, method="random"):
+ """
+ Process sentences and calculate logits for masked tokens.
+ """
+ results = {}
+
+ for sentence, ngrams in result_dict.items():
+ # Split punctuation from the last word before processing
+ words = sentence.split()
+ last_word = words[-1]
+ if any(last_word.endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
+ # Split the last word and punctuation
+ words[-1] = last_word[:-1]
+ punctuation = last_word[-1]
+ # Rejoin with space before punctuation to treat it as separate token
+ processed_sentence = " ".join(words) + " " + punctuation
+ else:
+ processed_sentence = sentence
+
+ if method == "random":
+ masked_sentence, original_mask_indices = self.mask_sentence_random(processed_sentence, ngrams)
+ elif method == "pseudorandom":
+ masked_sentence, original_mask_indices = self.mask_sentence_pseudorandom(processed_sentence, ngrams)
+ else: # entropy
+ masked_sentence, original_mask_indices = self.mask_sentence_entropy(processed_sentence, ngrams)
+
+ logits = self.calculate_mask_logits(processed_sentence, original_mask_indices)
+ results[sentence] = {
+ "masked_sentence": masked_sentence,
+ "mask_logits": logits
+ }
+
+ return results
+
+
+
+if __name__ == "__main__":
+    # !!! Works in both cases, whether or not stopwords are removed
+ sentences = [
+ "The quick brown fox jumps over small cat the lazy dog everyday again and again .",
+ # "A speedy brown fox jumps over a lazy dog.",
+ # "A swift brown fox leaps over the lethargic dog."
+
+ ]
+    result_dict = {
+ 'The quick brown fox jumps over small cat the lazy dog everyday again and again .': {'brown fox': [(2, 3)],'cat': [(7, 7)], 'dog': [(10, 10)]},
+ # 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
+ # 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
+ }
+
+
+ processor = MaskingProcessor()
+ # results_random = processor.process_sentences(sentences, result_dict)
+ results_entropy = processor.process_sentences(sentences, result_dict, method="random")
+
+ '''
+ results structure :
+ results = {
+ "The quick brown fox jumps over the lazy dog everyday.":
+ { # Original sentence as key
+ "masked_sentence": str, # The sentence with [MASK] tokens
+ "mask_logits":
+ { # Dictionary of mask positions and their predictions
+ 1:
+                {   # Word position of the mask in the sentence
+                    "tokens": list,  # up to 50 predicted whole-word tokens, highest logit first
+                    "logits": list   # corresponding raw logits (scores, not probabilities)
+                },
+                7:
+                {
+                    "tokens": list,
+                    "logits": list
+                },
+                10:
+                {
+                    "tokens": list,
+                    "logits": list
+                }
+ }
+ }
+ }
+
+ '''
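+
+    # Hedged sketch of consuming the structure above: pick the highest-logit
+    # whole-word candidate for every masked position (tokens/logits are returned
+    # sorted by logit, so index 0 is the top prediction):
+    #
+    #   for sent, out in results_entropy.items():
+    #       for pos, preds in out["mask_logits"].items():
+    #           print(pos, preds["tokens"][0], preds["logits"][0])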
+ # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)
+
+ for sentence, output in results_entropy.items():
+ print(f"Original Sentence (Random): {sentence}")
+ print(f"Masked Sentence (Random): {output['masked_sentence']}")
+ # print(f"Mask Logits (Random): {output['mask_logits']}")
+ # print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
+ # print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
+ # print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
+ # print('--------------------------------')
+ # for mask_idx, logits in output["mask_logits"].items():
+ # print(f"Logits for [MASK] at position {mask_idx}:")
+ # print(f' logits : {logits[:5]}') # List of logits for all vocabulary tokens
+ # print(f' len(logits) : {len(logits)}')
+
+
+# ------------------------------------------------------------------------------------------------
+ # def mask_sentence_random(self, sentence, common_ngrams):
+ # """
+ # Mask words in the sentence based on the specified rules after removing stopwords.
+ # """
+ # original_words = sentence.split()
+ # # print(f' ---- original_words : {original_words} ----- ')
+ # non_stop_words = self.remove_stopwords(original_words)
+ # # print(f' ---- non_stop_words : {non_stop_words} ----- ')
+ # adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
+ # # print(f' ---- common_ngrams : {common_ngrams} ----- ')
+ # # print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')
+
+ # mask_indices = []
+
+ # # Extract n-gram positions in non-stop words
+ # ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
+
+ # # Mask a word before the first common n-gram
+ # if ngram_positions:
+ # # print(f' ---- ngram_positions : {ngram_positions} ----- ')
+ # first_ngram_start = ngram_positions[0][0]
+ # # print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
+ # if first_ngram_start > 0:
+ # mask_index_before_ngram = random.randint(0, first_ngram_start-1)
+ # # print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
+ # mask_indices.append(mask_index_before_ngram)
+
+ # # Mask words between common n-grams
+ # for i in range(len(ngram_positions) - 1):
+ # end_prev = ngram_positions[i][1]
+ # # print(f' ---- end_prev : {end_prev} ----- ')
+ # start_next = ngram_positions[i + 1][0]
+ # # print(f' ---- start_next : {start_next} ----- ')
+ # if start_next > end_prev + 1:
+ # mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
+ # # print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
+ # mask_indices.append(mask_index_between_ngrams)
+
+ # # Mask a word after the last common n-gram
+ # last_ngram_end = ngram_positions[-1][1]
+ # if last_ngram_end < len(non_stop_words) - 1:
+ # # print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
+ # mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
+ # # print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
+ # mask_indices.append(mask_index_after_ngram)
+
+ # # Create mapping from non-stop words to original indices
+ # non_stop_to_original = {}
+ # non_stop_idx = 0
+ # for orig_idx, word in enumerate(original_words):
+ # if word.lower() not in self.stop_words:
+ # non_stop_to_original[non_stop_idx] = orig_idx
+ # non_stop_idx += 1
+
+ # # Map mask indices from non-stop word positions to original positions
+ # # print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ')
+ # original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
+ # # print(f' ---- original_mask_indices : {original_mask_indices} ----- ')
+
+ # # Apply masks to the original sentence
+ # masked_words = original_words.copy()
+ # for idx in original_mask_indices:
+ # masked_words[idx] = self.tokenizer.mask_token
+
+ # return " ".join(masked_words), original_mask_indices
diff --git a/utils/old/non_melting_points_v1.py b/utils/old/non_melting_points_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a6369ecfc79e18cba487f54f59f8e00cdaf58be
--- /dev/null
+++ b/utils/old/non_melting_points_v1.py
@@ -0,0 +1,244 @@
+import nltk
+from nltk.corpus import stopwords
+from nltk.util import ngrams
+from collections import Counter
+import re
+
+class NgramProcessor:
+ def __init__(self):
+ try:
+ nltk.data.find('corpora/stopwords')
+ except LookupError:
+ nltk.download('stopwords')
+
+ self.stop_words = set(stopwords.words('english'))
+
+ def remove_stopwords(self, text):
+ """
+ Remove stopwords using NLTK's stopword list
+
+ Args:
+ text (str): Input text
+
+ Returns:
+ str: Cleaned text with stopwords removed
+ """
+ words = re.findall(r'\w+', text.lower())
+ filtered_words = [word for word in words if word not in self.stop_words]
+ return ' '.join(filtered_words)
+
+ def is_exact_match(self, ngram, sentences):
+ """
+ Check if the given n-gram has an exact match in all sentences
+
+ Args:
+ ngram (str): The n-gram to search for
+ sentences (list): List of sentences to search in
+
+ Returns:
+ bool: True if n-gram has exact match in all sentences, False otherwise
+ """
+ return all(ngram in sentence for sentence in sentences)
+
+ def is_substring_of_any(self, ngram, common_ngrams):
+ """
+ Check if the given n-gram is an exact substring of any previously found common n-grams
+
+ Args:
+ ngram (str): The n-gram to check
+ common_ngrams (list): List of previously found common n-grams
+
+ Returns:
+ bool: True if ngram is a substring of any common_ngrams, False otherwise
+ """
+ return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)
+
+ def find_filtered_ngrams(self, sentences):
+ """
+ Find all n-grams that have exact matches across all sentences,
+ excluding those that are part of larger common n-grams
+
+ Args:
+ sentences (list): List of sentences to analyze
+
+ Returns:
+ list: List of tuples where each tuple contains the n-gram and its indices in each sentence
+ """
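+        # NOTE: candidate n-grams are drawn from the first stopword-filtered sentence and
+        # kept only if the same string occurs in every filtered sentence and is not part
+        # of a longer n-gram that was already accepted. Their indices are then located in
+        # the *original* sentences by an exact lowercase word match, so a word with
+        # attached punctuation (e.g. "dog.") will not be matched at that position.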
+ original_sentences = sentences[:]
+ sentences = [self.remove_stopwords(sentence) for sentence in sentences]
+ ngram_lengths = [4, 3, 2, 1] # Quadgram, trigram, bigram, unigram
+ common_ngrams = []
+
+ for n in ngram_lengths:
+ ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences]
+ ngrams_counter = Counter(ngrams_list[0])
+
+ for ngram in ngrams_counter:
+ ngram_str = ' '.join(ngram)
+ if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, [ng[0] for ng in common_ngrams]):
+ indices = []
+ for original_sentence in original_sentences:
+ words = original_sentence.split()
+ ngram_indices = [
+ (i, i + n - 1) for i in range(len(words) - n + 1)
+ if ' '.join(words[i:i + n]).lower() == ngram_str
+ ]
+ indices.append(ngram_indices)
+ common_ngrams.append((ngram_str, indices))
+
+ return common_ngrams
+
+ def find_relative_order(self, sentence, common_ngrams):
+ """
+ Find the relative order of the common n-grams in the sentence
+
+ Args:
+ sentence (str): Sentence in which to find the relative order
+ common_ngrams (list): List of common n-grams
+
+ Returns:
+ list: List of tuples with the relative position and the n-gram
+ """
+ relative_order = []
+ for ngram, _ in common_ngrams:
+ index = sentence.find(ngram)
+ if index != -1:
+ relative_order.append((index, ngram))
+
+ return sorted(relative_order)
+
+# Example usage
+if __name__ == "__main__":
+ sentences = [
+ "The quick brown fox jumps over the lazy dog.",
+ "A quick brown dog outpaces a lazy fox.",
+ "Quick brown animals leap over lazy obstacles."
+ ]
+
+ processor = NgramProcessor()
+ common_ngrams = processor.find_filtered_ngrams(sentences)
+ print("Common n-grams and their indices:")
+ for ngram, indices in common_ngrams:
+ print(f"{ngram}: {indices}")
+
+ for sentence in sentences:
+ relative_order = processor.find_relative_order(sentence, common_ngrams)
+ print(f"Relative order in sentence '{sentence}':", relative_order)
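+
+    # For the three sentences above, the traced output (shown for illustration only)
+    # looks roughly like:
+    #   [('quick brown', [[(1, 2)], [(1, 2)], [(0, 1)]]),
+    #    ('lazy',        [[(7, 7)], [(6, 6)], [(5, 5)]])]
+    # i.e. each common n-gram is paired with its (start, end) word indices per sentence,
+    # and find_relative_order then sorts the surviving n-grams by their character offset
+    # in the given sentence (case-sensitive substring search).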
+
+
+
+# import nltk
+# from nltk.corpus import stopwords
+# from nltk.util import ngrams
+# from collections import Counter
+# import re
+
+# class NgramProcessor:
+# def __init__(self):
+# try:
+# nltk.data.find('corpora/stopwords')
+# except LookupError:
+# nltk.download('stopwords')
+
+# self.stop_words = set(stopwords.words('english'))
+
+# def remove_stopwords(self, text):
+# """
+# Remove stopwords using NLTK's stopword list
+
+# Args:
+# text (str): Input text
+
+# Returns:
+# str: Cleaned text with stopwords removed
+# """
+# words = re.findall(r'\w+', text.lower())
+# filtered_words = [word for word in words if word not in self.stop_words]
+# return ' '.join(filtered_words)
+
+# def is_exact_match(self, ngram, sentences):
+# """
+# Check if the given n-gram has an exact match in all sentences
+
+# Args:
+# ngram (str): The n-gram to search for
+# sentences (list): List of sentences to search in
+
+# Returns:
+# bool: True if n-gram has exact match in all sentences, False otherwise
+# """
+# return all(ngram in sentence for sentence in sentences)
+
+# def is_substring_of_any(self, ngram, common_ngrams):
+# """
+# Check if the given n-gram is an exact substring of any previously found common n-grams
+
+# Args:
+# ngram (str): The n-gram to check
+# common_ngrams (list): List of previously found common n-grams
+
+# Returns:
+# bool: True if ngram is a substring of any common_ngrams, False otherwise
+# """
+# return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)
+
+# def find_filtered_ngrams(self, sentences):
+# """
+# Find all n-grams that have exact matches across all sentences,
+# excluding those that are part of larger common n-grams
+
+# Args:
+# sentences (list): List of sentences to analyze
+
+# Returns:
+# list: List of all common n-grams in order of their appearance in the first sentence
+# """
+# sentences = [self.remove_stopwords(sentence) for sentence in sentences]
+# ngram_lengths = [4, 3, 2, 1] # Quadgram, trigram, bigram, unigram
+# common_ngrams = []
+
+# for n in ngram_lengths:
+# ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences]
+# ngrams_counter = Counter(ngrams_list[0])
+
+# for ngram in ngrams_counter:
+# ngram_str = ' '.join(ngram)
+# if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, common_ngrams):
+# common_ngrams.append(ngram_str)
+
+# return common_ngrams
+
+# def find_relative_order(self, sentence, common_ngrams):
+# """
+# Find the relative order of the common n-grams in the sentence
+
+# Args:
+# sentence (str): Sentence in which to find the relative order
+# common_ngrams (list): List of common n-grams
+
+# Returns:
+# list: List of tuples with the relative position and the n-gram
+# """
+# relative_order = []
+# for ngram in common_ngrams:
+# index = sentence.find(ngram)
+# if index != -1:
+# relative_order.append((index, ngram))
+
+# return sorted(relative_order)
+
+# # Example usage
+# if __name__ == "__main__":
+# sentences = [
+# "The quick brown fox jumps over the lazy dog.",
+# "A quick brown dog outpaces a lazy fox.",
+# "Quick brown animals leap over lazy obstacles."
+# ]
+
+# processor = NgramProcessor()
+# common_ngrams = processor.find_filtered_ngrams(sentences)
+# print("Common n-grams:", common_ngrams)
+
+# for sentence in sentences:
+# relative_order = processor.find_relative_order(sentence, common_ngrams)
+# print(f"Relative order in sentence '{sentence}':", relative_order)
diff --git a/utils/old/sampling/sampling.py b/utils/old/sampling/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c0a58912beeb4d15d0c982d790d22423594c359
--- /dev/null
+++ b/utils/old/sampling/sampling.py
@@ -0,0 +1,330 @@
+import torch
+import random
+from masking_methods import MaskingProcessor
+import nltk
+from nltk.corpus import words
+import torch.nn.functional as F
+
+
+class SamplingProcessor:
+ def __init__(self, tokenizer):
+ """
+ Initialize the SamplingProcessor.
+
+ Args:
+ tokenizer: BERT tokenizer instance
+ """
+ self.tokenizer = tokenizer
+ self.subtoken_prefix = self._get_subtoken_prefix()
+ self.subtoken_ids = self._get_subtoken_ids()
+ try:
+ nltk.data.find('corpora/words')
+ except LookupError:
+ nltk.download('words')
+ self.english_words = set(words.words())
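+        # NOTE: self.english_words is not referenced by any method in this file; it
+        # appears to have been kept for experiments with word-validity filtering.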
+
+ def _get_subtoken_prefix(self):
+ """
+ Identify the subtoken prefix based on the tokenizer.
+
+ Returns:
+ str: The prefix used for subtokens (e.g., "##" for BERT).
+ """
+ # This method assumes that the tokenizer uses a consistent subtoken prefix.
+ # Adjust accordingly if using different tokenizers.
+ # For BERT's WordPiece tokenizer:
+ if hasattr(self.tokenizer, "init_kwargs") and "wordpiece_prefix" in self.tokenizer.init_kwargs:
+ return self.tokenizer.init_kwargs["wordpiece_prefix"]
+ elif hasattr(self.tokenizer, "prefix_tokens"):
+ return self.tokenizer.prefix_tokens
+ else:
+ # Default to BERT's subtoken prefix
+ return "##"
+
+
+ # def _get_subtoken_ids(self):
+ # """
+ # Retrieve all token IDs that correspond to subtokens.
+
+ # Returns:
+ # set: A set of subtoken IDs.
+ # """
+ # vocab = self.tokenizer.get_vocab()
+ # subtoken_ids = set()
+ # for token, idx in vocab.items():
+ # if token.startswith(self.subtoken_prefix):
+ # subtoken_ids.add(idx)
+ # return subtoken_ids
+
+ def _get_subtoken_ids(self):
+ """
+ Retrieve all token IDs that correspond to subtokens.
+
+ Returns:
+ list: A list of subtoken IDs.
+ """
+ vocab = self.tokenizer.get_vocab()
+ subtoken_ids = []
+ for token, idx in vocab.items():
+ if token.startswith(self.subtoken_prefix):
+ subtoken_ids.append(idx)
+ return subtoken_ids # Changed from set to list
+
+
+ def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0):
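+        """
+        Replace each [MASK] in the tokenized sentence with a sampled token.
+
+        Args:
+            mask_logits_dict (dict): Maps a mask position to the logits for that mask.
+                Positions are assumed to index into self.tokenizer.tokenize(masked_sentence).
+            masked_sentence (str): Sentence containing [MASK] tokens.
+            sampling_technique (str): "temperature", "greedy", "inverse_transform" or
+                "exponential_minimum".
+            temperature (float): Temperature used when turning logits into probabilities.
+
+        Returns:
+            str: The detokenized sentence with the masks filled in.
+        """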
+ tokens = self.tokenizer.tokenize(masked_sentence)
+
+ for mask_pos in sorted(mask_logits_dict.keys()):
+ try:
+ # Get logits and squeeze extra dimension
+ mask_logits = torch.tensor(mask_logits_dict[mask_pos]).squeeze(0) # Remove the extra dimension
+
+ # Create a mask for valid tokens (no special tokens, no subwords)
+ valid_mask = torch.zeros_like(mask_logits, dtype=torch.bool)
+ for idx in range(len(mask_logits)):
+ token = self.tokenizer.convert_ids_to_tokens([idx])[0]
+ # Only allow regular words (no special tokens, no subwords)
+ if token.isalpha() and not token.startswith('[') and not token.startswith('##'):
+ valid_mask[idx] = True
+
+ # Get valid logits
+ valid_logits = mask_logits[valid_mask]
+ valid_indices = torch.where(valid_mask)[0]
+
+ if len(valid_logits) == 0:
+ print(f"Warning: No valid tokens found for position {mask_pos}")
+ continue
+
+ if sampling_technique == "inverse_transform":
+ probs = torch.softmax(valid_logits / temperature, dim=-1)
+ cumulative_probs = torch.cumsum(probs, dim=-1)
+ random_prob = random.random()
+ sampled_idx = torch.where(cumulative_probs >= random_prob)[0][0].item()
+ sampled_index = valid_indices[sampled_idx].item()
+
+ elif sampling_technique == "exponential_minimum":
+ probs = torch.softmax(valid_logits / temperature, dim=-1)
+ exp_probs = torch.exp(-torch.log(probs))
+ random_probs = torch.rand_like(exp_probs)
+ sampled_idx = torch.argmax(random_probs * exp_probs).item()
+ sampled_index = valid_indices[sampled_idx].item()
+
+ elif sampling_technique == "temperature":
+ valid_logits = torch.clamp(valid_logits, min=-1e8, max=1e8)
+ probs = torch.softmax(valid_logits / temperature, dim=-1)
+ if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
+ raise ValueError("The computed probabilities contain NaN or inf values.")
+ probs = torch.max(probs, torch.tensor(1e-8))
+ probs = probs / torch.sum(probs)
+ sampled_idx = torch.multinomial(probs, 1)[0].item()
+ sampled_index = valid_indices[sampled_idx].item()
+
+ elif sampling_technique == 'greedy':
+ sampled_idx = torch.argmax(valid_logits).item()
+ sampled_index = valid_indices[sampled_idx].item()
+
+ # Replace mask with sampled token
+ sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
+ tokens[mask_pos] = sampled_token
+
+ except Exception as e:
+ print(f"Error sampling for position {mask_pos}: {str(e)}")
+ continue
+
+ return self.tokenizer.convert_tokens_to_string(tokens)
+
+
+
+ def process_masked_sentences(self, results_dict, sampling_technique="temperature", temperature=1.0):
+ """
+ Process all masked sentences in the results dictionary.
+
+ Args:
+ results_dict (dict): Dictionary containing masked sentences and their logits
+ sampling_technique (str): Sampling method to use
+ temperature (float): Temperature parameter for sampling
+
+ Returns:
+ dict: Dictionary containing original, masked, and sampled sentences
+ """
+ processed_results = {}
+
+ for original_sentence, data in results_dict.items():
+ masked_sentence = data["masked_sentence"]
+ mask_logits = data["mask_logits"]
+
+ sampled_sentence = self.sample_tokens(
+ mask_logits,
+ masked_sentence,
+ sampling_technique,
+ temperature
+ )
+
+ processed_results[original_sentence] = {
+ "masked_sentence": masked_sentence,
+ "sampled_sentence": sampled_sentence
+ }
+
+ return processed_results
+
+if __name__ == "__main__":
+ sentences = [
+ "The quick brown fox jumps over the lazy dog everyday.",
+ ]
+ result_dict = {
+ 'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
+ }
+
+ # First, mask the sentences
+ masking_processor = MaskingProcessor()
+ masking_results = masking_processor.process_sentences(sentences, result_dict)
+
+ # Then, sample replacements for the masks
+ sampling_processor = SamplingProcessor(masking_processor.tokenizer)
+
+ # Try different sampling techniques
+ sampling_techniques = ["temperature", "greedy", "inverse_transform", "exponential_minimum"]
+
+ for technique in sampling_techniques:
+ print(f"\nSampling using {technique}:")
+ sampled_results = sampling_processor.process_masked_sentences(
+ masking_results,
+ sampling_technique=technique,
+ temperature=1.0
+ )
+
+ for original_sentence, result in sampled_results.items():
+ print(f"Original: {original_sentence}")
+ print(f"Masked: {result['masked_sentence']}")
+ print(f"Sampled: {result['sampled_sentence']}")
+ print("---")
+
+# --------------------------------------------------------------------------------------------------
+ # def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0, top_k=100):
+ # words = masked_sentence.split()
+ # mask_positions = sorted(mask_logits_dict.keys())
+
+ # for mask_pos in mask_positions:
+ # mask_logits = torch.tensor(mask_logits_dict[mask_pos])
+
+ # try:
+ # if sampling_technique == "inverse_transform":
+ # probs = torch.softmax(mask_logits / temperature, dim=-1)
+ # cumulative_probs = torch.cumsum(probs, dim=-1)
+ # random_prob = random.random()
+ # sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()
+
+ # elif sampling_technique == "exponential_minimum":
+ # probs = torch.softmax(mask_logits / temperature, dim=-1)
+ # exp_probs = torch.exp(-torch.log(probs))
+ # random_probs = torch.rand_like(exp_probs)
+ # sampled_index = torch.argmax(random_probs * exp_probs).item()
+
+ # elif sampling_technique == "temperature":
+ # mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8)
+ # probs = torch.softmax(mask_logits / temperature, dim=-1)
+ # if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
+ # raise ValueError("The computed probabilities contain NaN or inf values.")
+ # probs = torch.max(probs, torch.tensor(1e-8))
+ # probs = probs / torch.sum(probs)
+ # sampled_index = torch.multinomial(probs, 1)[0].item()
+
+ # elif sampling_technique == 'greedy':
+ # sampled_index = torch.argmax(mask_logits).item()
+
+ # else:
+ # raise ValueError(f"Unknown sampling technique: {sampling_technique}")
+
+ # # Replace mask with sampled token
+ # sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
+ # words[mask_pos] = sampled_token
+
+ # except Exception as e:
+ # print(f"Error sampling for position {mask_pos}: {str(e)}")
+ # continue
+
+ # return " ".join(words)
+
+ ## MORE WEIRD RESULTS
+ # def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0, top_k=100):
+ # words = masked_sentence.split()
+ # mask_positions = sorted(mask_logits_dict.keys())
+
+ # for mask_pos in mask_positions:
+ # mask_logits = torch.tensor(mask_logits_dict[mask_pos])
+
+ # try:
+ # # Create a mask for valid tokens (no special tokens, no subwords)
+ # valid_mask = torch.zeros_like(mask_logits, dtype=torch.bool)
+ # for idx in range(len(mask_logits)):
+ # token = self.tokenizer.convert_ids_to_tokens([idx])[0]
+ # # Only allow regular words (no special tokens, no subwords)
+ # if token.isalpha() and not token.startswith('[') and not token.startswith('##'):
+ # valid_mask[idx] = True
+
+ # # Get valid logits
+ # valid_logits = mask_logits[valid_mask]
+ # valid_indices = torch.where(valid_mask)[0]
+
+ # if len(valid_logits) == 0:
+ # print(f"Warning: No valid tokens found for position {mask_pos}")
+ # continue
+
+ # if sampling_technique == "inverse_transform":
+ # probs = torch.softmax(valid_logits / temperature, dim=-1)
+ # cumulative_probs = torch.cumsum(probs, dim=-1)
+ # random_prob = random.random()
+ # sampled_idx = torch.where(cumulative_probs >= random_prob)[0][0].item()
+ # sampled_index = valid_indices[sampled_idx].item()
+
+ # elif sampling_technique == "exponential_minimum":
+ # probs = torch.softmax(valid_logits / temperature, dim=-1)
+ # exp_probs = torch.exp(-torch.log(probs))
+ # random_probs = torch.rand_like(exp_probs)
+ # sampled_idx = torch.argmax(random_probs * exp_probs).item()
+ # sampled_index = valid_indices[sampled_idx].item()
+
+ # elif sampling_technique == "temperature":
+ # valid_logits = torch.clamp(valid_logits, min=-1e8, max=1e8)
+ # probs = torch.softmax(valid_logits / temperature, dim=-1)
+ # if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
+ # raise ValueError("The computed probabilities contain NaN or inf values.")
+ # probs = torch.max(probs, torch.tensor(1e-8))
+ # probs = probs / torch.sum(probs)
+ # sampled_idx = torch.multinomial(probs, 1)[0].item()
+ # sampled_index = valid_indices[sampled_idx].item()
+
+ # elif sampling_technique == 'greedy':
+ # sampled_idx = torch.argmax(valid_logits).item()
+ # sampled_index = valid_indices[sampled_idx].item()
+
+ # else:
+ # raise ValueError(f"Unknown sampling technique: {sampling_technique}")
+
+ # # Replace mask with sampled token
+ # sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
+ # words[mask_pos] = sampled_token
+
+ # except Exception as e:
+ # print(f"Error sampling for position {mask_pos}: {str(e)}")
+ # continue
+
+ # return " ".join(words)
\ No newline at end of file
diff --git a/utils/old/sampling/sampling_methods.py b/utils/old/sampling/sampling_methods.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1e2c8cd22a8edfa8116e1274c7dfbdc3c64bc3
--- /dev/null
+++ b/utils/old/sampling/sampling_methods.py
@@ -0,0 +1,291 @@
+from transformers import BertTokenizer, BertForMaskedLM
+import torch
+import random
+from masking_methods import MaskingProcessor
+from transformers import pipeline
+
+class SamplingProcessorWithModel:
+ def __init__(self, model_name='bert-base-uncased'):
+ self.tokenizer = BertTokenizer.from_pretrained(model_name)
+ self.model = BertForMaskedLM.from_pretrained(model_name)
+ self.model.eval() # Set the model to evaluation mode
+
+ def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0):
+ """
+ Fills each mask in the masked sentence using the specified sampling technique.
+
+ Args:
+ masked_sentence (str): Sentence with [MASK] tokens.
+ sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
+ temperature (float): Temperature parameter for sampling methods.
+
+ Returns:
+ str: Sentence with the masks filled.
+ """
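+        # Masks are resolved left to right: each iteration re-runs the model on the
+        # current input_ids and fills only the first remaining [MASK], so tokens sampled
+        # earlier condition the predictions for the later masks.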
+ input_ids = self.tokenizer.encode(masked_sentence, return_tensors="pt")
+
+ while self.tokenizer.mask_token_id in input_ids[0]:
+ # Find indices of all [MASK] tokens
+ mask_indices = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
+
+ # Process the first [MASK] token in the sequence
+ mask_index = mask_indices[0].item()
+
+ # Get logits from the model
+ with torch.no_grad():
+ outputs = self.model(input_ids)
+ logits = outputs.logits
+
+ # Extract logits for the [MASK] token
+ mask_logits = logits[0, mask_index]
+
+ if sampling_technique == "inverse_transform":
+ probs = torch.softmax(mask_logits / temperature, dim=-1)
+ cumulative_probs = torch.cumsum(probs, dim=-1)
+ random_prob = random.random()
+ sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()
+
+ elif sampling_technique == "exponential_minimum":
+ probs = torch.softmax(mask_logits / temperature, dim=-1)
+ exp_probs = torch.exp(-torch.log(probs))
+ random_probs = torch.rand_like(exp_probs)
+ sampled_index = torch.argmax(random_probs * exp_probs).item()
+
+ elif sampling_technique == "temperature":
+ mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8)
+ probs = torch.softmax(mask_logits / temperature, dim=-1)
+ if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
+ raise ValueError("The computed probabilities contain NaN or inf values.")
+ probs = torch.max(probs, torch.tensor(1e-8, device=mask_logits.device))
+ probs = probs / torch.sum(probs)
+ probs = probs.flatten()
+ if probs.size(0) > 1:
+ sampled_index = torch.multinomial(probs, 1).item()
+ else:
+ sampled_index = torch.argmax(probs).item()
+
+ elif sampling_technique == 'greedy':
+ sampled_index = torch.argmax(mask_logits).item()
+
+ else:
+ raise ValueError(f"Unknown sampling technique: {sampling_technique}")
+
+ # Replace the first [MASK] with the selected token
+ input_ids[0, mask_index] = sampled_index
+
+ return self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
+
+
+# Example usage
+if __name__ == "__main__":
+ from transformers import BertTokenizer
+
+ # Define sentences and result_dict
+ sentences = [
+ "The quick brown fox jumps over the lazy dog.",
+ "A quick brown dog outpaces a lazy fox.",
+ "Quick brown dog leaps over lazy the fox."
+ ]
+ result_dict = {
+ "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]},
+ "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]},
+ "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}
+ }
+
+ masking_processor = MaskingProcessor()
+ masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)
+
+ # Use SamplingProcessor
+ sampling_processor = SamplingProcessorWithModel()
+
+ # Iterate through masking results to apply sampling
+ for sentence, result in masking_results.items():
+ print(f"Original Sentence (Random): {sentence}")
+ print(f"Masked Sentence (Random): {result['masked_sentence']}")
+ masked_sentence = result["masked_sentence"]
+
+ # Apply different sampling techniques
+ for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
+ print(f"Sampling Technique: {technique}")
+ filled_sentence = sampling_processor.fill_masked_sentence(
+ masked_sentence=masked_sentence,
+ sampling_technique=technique,
+ temperature=1.0 # Adjust temperature as needed
+ )
+ print(f"Filled Sentence: {filled_sentence}\n")
+ print('--------------------------------')
+
+
+
+# from transformers import pipeline
+# import torch
+# import random
+# from masking_methods import MaskingProcessor
+
+
+# class SamplingProcessorWithPipeline:
+# def __init__(self, model_name='bert-base-uncased'):
+# self.unmasker = pipeline('fill-mask', model=model_name)
+# self.tokenizer = self.unmasker.tokenizer
+
+# def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0):
+# """
+# Fills each mask in the masked sentence using the specified sampling technique.
+
+# Args:
+# masked_sentence (str): Sentence with [MASK] tokens.
+# sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
+# temperature (float): Temperature parameter for sampling methods.
+
+# Returns:
+# str: Sentence with the masks filled.
+# """
+# while '[MASK]' in masked_sentence:
+# # Get predictions for the first [MASK]
+# predictions = self.unmasker(masked_sentence)
+# print(f' predictions : {predictions}')
+# print(f' type of predictions : {type(predictions)}')
+
+# # Ensure predictions is a list of dictionaries for the first [MASK]
+# if not isinstance(predictions, list) or not all(isinstance(pred, dict) for pred in predictions):
+# raise ValueError("Unexpected structure in predictions from the pipeline.")
+
+# # Extract logits (scores) from the predictions
+# logits = torch.tensor([pred['score'] for pred in predictions], dtype=torch.float32)
+
+# if sampling_technique == "inverse_transform":
+# probs = torch.softmax(logits / temperature, dim=-1)
+# cumulative_probs = torch.cumsum(probs, dim=-1)
+# random_prob = random.random()
+# sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()
+
+# elif sampling_technique == "exponential_minimum":
+# probs = torch.softmax(logits / temperature, dim=-1)
+# exp_probs = torch.exp(-torch.log(probs))
+# random_probs = torch.rand_like(exp_probs)
+# sampled_index = torch.argmax(random_probs * exp_probs).item()
+
+# elif sampling_technique == "temperature":
+# logits = torch.clamp(logits, min=-1e8, max=1e8)
+# probs = torch.softmax(logits / temperature, dim=-1)
+# if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
+# raise ValueError("The computed probabilities contain NaN or inf values.")
+# probs = torch.max(probs, torch.tensor(1e-8, device=logits.device))
+# probs = probs / torch.sum(probs)
+# probs = probs.flatten()
+# if probs.size(0) > 1:
+# sampled_index = torch.multinomial(probs, 1).item()
+# else:
+# sampled_index = torch.argmax(probs).item()
+
+# elif sampling_technique == 'greedy':
+# sampled_index = torch.argmax(logits).item()
+
+# else:
+# raise ValueError(f"Unknown sampling technique: {sampling_technique}")
+
+# # Replace the first [MASK] with the selected word
+# sampled_token = predictions[sampled_index]['token_str']
+# masked_sentence = masked_sentence.replace('[MASK]', sampled_token, 1)
+
+# return masked_sentence
+
+
+# # Example usage
+# if __name__ == "__main__":
+# from transformers import BertTokenizer
+
+# # Define sentences and result_dict
+# sentences = [
+# "The quick brown fox jumps over the lazy dog.",
+# "A quick brown dog outpaces a lazy fox.",
+# "Quick brown animals leap over lazy obstacles."
+# ]
+# result_dict = {
+# "The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]},
+# "A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]},
+# "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]}
+# }
+
+# masking_processor = MaskingProcessor()
+# masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)
+
+# # Use SamplingProcessor
+# sampling_processor = SamplingProcessorWithPipeline()
+
+# # Iterate through masking results to apply sampling
+# for sentence, result in masking_results.items():
+# print(f"Original Sentence (Random): {sentence}")
+# print(f"Masked Sentence (Random): {result['masked_sentence']}")
+# masked_sentence = result["masked_sentence"]
+
+# # Apply different sampling techniques
+# for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
+# print(f"Sampling Technique: {technique}")
+# filled_sentence = sampling_processor.fill_masked_sentence(
+# masked_sentence=masked_sentence,
+# sampling_technique=technique,
+# temperature=1.0 # Adjust temperature as needed
+# )
+# print(f"Filled Sentence: {filled_sentence}\n")
+# print('--------------------------------')
diff --git a/utils/old/sampling/sampling_methods_v1.py b/utils/old/sampling/sampling_methods_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4b907c6c54dac0ac293d58e5c234d3fb7f34fc4
--- /dev/null
+++ b/utils/old/sampling/sampling_methods_v1.py
@@ -0,0 +1,146 @@
+import torch
+import random
+from masking_methods import MaskingProcessor
+
+class SamplingProcessor:
+ def __init__(self, tokenizer):
+ self.tokenizer = tokenizer
+
+ def fill_masked_sentence(self, original_sentence, mask_logits, sampling_technique, temperature=1.0):
+ """
+ Fills each mask in the masked sentence using the specified sampling technique.
+
+ Args:
+ original_sentence (str): The original masked sentence.
+ mask_logits (dict): Logits for each [MASK] token.
+ sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
+ temperature (float): Temperature parameter for sampling methods.
+
+ Returns:
+ str: Sentence with the masks filled.
+ """
+ sentence_tokens = self.tokenizer.tokenize(original_sentence)
+ mask_token_indices = [i for i, token in enumerate(sentence_tokens) if token == self.tokenizer.mask_token]
+
+ if len(mask_token_indices) != len(mask_logits):
+ raise ValueError("Mismatch between number of [MASK] tokens and logits provided.")
+
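+        # The zip below pairs masks with logits purely by order, so mask_logits is
+        # assumed to list its entries in the same left-to-right order in which the
+        # [MASK] tokens appear in the sentence.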
+ for mask_idx, filtered_logits in zip(mask_token_indices, mask_logits.values()):
+ # Convert logits to a tensor
+ filtered_logits = torch.tensor(filtered_logits)
+ # filtered_logits, _ = torch.sort(filtered_logits, descending=True)
+ # print(f' type of filtered_logits : {type(filtered_logits)}')
+ # filtered_logits = filtered_logits[:5]
+
+ if sampling_technique == "inverse_transform":
+ probs = torch.softmax(filtered_logits / temperature, dim=-1)
+ cumulative_probs = torch.cumsum(probs, dim=-1)
+ random_prob = random.random()
+ sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()
+
+ elif sampling_technique == "exponential_minimum":
+ probs = torch.softmax(filtered_logits / temperature, dim=-1)
+ exp_probs = torch.exp(-torch.log(probs))
+ random_probs = torch.rand_like(exp_probs)
+ sampled_index = torch.argmax(random_probs * exp_probs).item()
+
+ elif sampling_technique == "temperature":
+ filtered_logits = torch.clamp(filtered_logits, min=-1e8, max=1e8)
+ probs = torch.softmax(filtered_logits / temperature, dim=-1)
+ if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
+ raise ValueError("The computed probabilities contain NaN or inf values.")
+ probs = torch.max(probs, torch.tensor(1e-8, device=filtered_logits.device))
+ probs = probs / torch.sum(probs)
+ probs = probs.flatten()
+ if probs.size(0) > 1:
+ sampled_index = torch.multinomial(probs, 1).item()
+ else:
+ sampled_index = torch.argmax(probs).item()
+
+ elif sampling_technique == 'greedy':
+ sampled_index = torch.argmax(filtered_logits).item()
+
+ else:
+ raise ValueError(f"Unknown sampling technique: {sampling_technique}")
+
+ sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
+ sentence_tokens[mask_idx] = sampled_token
+
+ return self.tokenizer.convert_tokens_to_string(sentence_tokens)
+
+
+
+ def process_samples(self, masked_sentences, mask_logits, sampling_technique, temperature=1.0):
+ """
+ Process multiple masked sentences and fill their masks using the specified sampling technique.
+
+ Args:
+ masked_sentences (list): List of masked sentences.
+ mask_logits (dict): Logits for each [MASK] token in each sentence.
+ sampling_technique (str): Sampling technique to use.
+ temperature (float): Temperature parameter for sampling methods.
+
+ Returns:
+ list: List of sentences with masks filled.
+ """
+ filled_sentences = []
+ for sentence, logits in zip(masked_sentences, mask_logits):
+ filled_sentence = self.fill_masked_sentence(sentence, logits, sampling_technique, temperature)
+ filled_sentences.append(filled_sentence)
+ return filled_sentences
+
+# Example usage
+if __name__ == "__main__":
+ from transformers import BertTokenizer
+
+ # tokenizer = BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+ processor = SamplingProcessor(tokenizer)
+
+ sentences = [
+ "The quick brown fox jumps over the lazy dog.",
+ "A quick brown dog outpaces a lazy fox.",
+ "Quick brown dog leaps over lazy the fox."
+ ]
+ result_dict = {
+ "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]},
+ "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]},
+ "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}
+ }
+
+
+ masking_processor = MaskingProcessor()
+ masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)
+ # masked_sentence = "The [MASK] brown fox jumps [MASK] the lazy dog."
+ # mask_logits = {
+ # 1: torch.randn(len(tokenizer)), # Example logits for first [MASK]
+ # 5: torch.randn(len(tokenizer)), # Example logits for second [MASK]
+ # }
+
+ # Iterate through masking results to apply sampling
+ for sentence, result in masking_results.items():
+ print(f"Original Sentence (Random): {sentence}")
+ print(f"Masked Sentence (Random): {result['masked_sentence']}")
+ # print(f"Mask Logits (Random): {output['mask_logits']}")
+ print(f' type(result["mask_logits"]) : {type(result["mask_logits"])}')
+ print(f' length of result["mask_logits"] : {len(result["mask_logits"])}')
+ print(f' result["mask_logits"].keys() : {result["mask_logits"].keys()}')
+ masked_sentence = result["masked_sentence"]
+ mask_logits = result["mask_logits"]
+
+ print(f"Original Masked Sentence: {masked_sentence}")
+
+ # Apply different sampling techniques
+ for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
+ print(f"Sampling Technique: {technique}")
+
+ # Fill the masks using the sampling processor
+ filled_sentence = processor.fill_masked_sentence(
+ original_sentence=masked_sentence,
+ mask_logits=mask_logits,
+ sampling_technique=technique,
+ temperature=1.0 # Adjust temperature as needed
+ )
+
+ print(f"Filled Sentence: {filled_sentence}\n")
+ print('--------------------------------')
\ No newline at end of file
diff --git a/utils/old/sampling/sampling_methods_v2.py b/utils/old/sampling/sampling_methods_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff8400ff361b3fa5ba500385c12a62128d744b09
--- /dev/null
+++ b/utils/old/sampling/sampling_methods_v2.py
@@ -0,0 +1,112 @@
+from transformers import pipeline
+import torch
+import random
+from masking_methods import MaskingProcessor
+
+
+class SamplingProcessorWithPipeline:
+ def __init__(self, model_name='bert-base-uncased'):
+ self.unmasker = pipeline('fill-mask', model=model_name)
+ self.tokenizer = self.unmasker.tokenizer
+
+ def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0):
+ """
+ Fills each mask in the masked sentence using the specified sampling technique.
+
+ Args:
+ masked_sentence (str): Sentence with [MASK] tokens.
+ sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
+ temperature (float): Temperature parameter for sampling methods.
+
+ Returns:
+ str: Sentence with the masks filled.
+ """
+ while '[MASK]' in masked_sentence:
+ # Get predictions for the first [MASK]
+ predictions = self.unmasker(masked_sentence)
+ print(f' predictions : {predictions}')
+ print(f' type of predictions : {type(predictions)}')
+
+ # Ensure predictions is a list of dictionaries
+ if not isinstance(predictions, list) or not all(isinstance(pred, dict) for pred in predictions):
+ raise ValueError("Unexpected structure in predictions from the pipeline.")
+
+ # Extract logits (scores) from the predictions
+ logits = torch.tensor([pred['score'] for pred in predictions], dtype=torch.float32)
+
+ if sampling_technique == "inverse_transform":
+ probs = torch.softmax(logits / temperature, dim=-1)
+ cumulative_probs = torch.cumsum(probs, dim=-1)
+ random_prob = random.random()
+ sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()
+
+ elif sampling_technique == "exponential_minimum":
+ probs = torch.softmax(logits / temperature, dim=-1)
+ exp_probs = torch.exp(-torch.log(probs))
+ random_probs = torch.rand_like(exp_probs)
+ sampled_index = torch.argmax(random_probs * exp_probs).item()
+
+ elif sampling_technique == "temperature":
+ logits = torch.clamp(logits, min=-1e8, max=1e8)
+ probs = torch.softmax(logits / temperature, dim=-1)
+ if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
+ raise ValueError("The computed probabilities contain NaN or inf values.")
+ probs = torch.max(probs, torch.tensor(1e-8, device=logits.device))
+ probs = probs / torch.sum(probs)
+ probs = probs.flatten()
+ if probs.size(0) > 1:
+ sampled_index = torch.multinomial(probs, 1).item()
+ else:
+ sampled_index = torch.argmax(probs).item()
+
+ elif sampling_technique == 'greedy':
+ sampled_index = torch.argmax(logits).item()
+
+ else:
+ raise ValueError(f"Unknown sampling technique: {sampling_technique}")
+
+ # Replace the first [MASK] with the selected word
+ sampled_token = predictions[sampled_index]['token_str']
+ masked_sentence = masked_sentence.replace('[MASK]', sampled_token, 1)
+
+ return masked_sentence
+
+
+# Example usage
+if __name__ == "__main__":
+ from transformers import BertTokenizer
+
+ # Define sentences and result_dict
+ sentences = [
+ "The quick brown fox jumps over the lazy dog.",
+ "A quick brown dog outpaces a lazy fox.",
+ "Quick brown dog leaps over lazy the fox."
+ ]
+ result_dict = {
+ "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]},
+ "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]},
+ "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}
+ }
+
+ masking_processor = MaskingProcessor()
+ masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)
+
+ # Use SamplingProcessor
+ sampling_processor = SamplingProcessorWithPipeline()
+
+ # Iterate through masking results to apply sampling
+ for sentence, result in masking_results.items():
+ print(f"Original Sentence (Random): {sentence}")
+ print(f"Masked Sentence (Random): {result['masked_sentence']}")
+ masked_sentence = result["masked_sentence"]
+
+ # Apply different sampling techniques
+ for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
+ print(f"Sampling Technique: {technique}")
+ filled_sentence = sampling_processor.fill_masked_sentence(
+ masked_sentence=masked_sentence,
+ sampling_technique=technique,
+ temperature=1.0 # Adjust temperature as needed
+ )
+ print(f"Filled Sentence: {filled_sentence}\n")
+ print('--------------------------------')
diff --git a/utils/old/sampling_final_copy.py b/utils/old/sampling_final_copy.py
new file mode 100644
index 0000000000000000000000000000000000000000..d970bbc48e4aff0dccf12a27ba3673fc84555053
--- /dev/null
+++ b/utils/old/sampling_final_copy.py
@@ -0,0 +1,168 @@
+import torch
+import random
+from masking_methods import MaskingProcessor
+
+class SamplingProcessor:
+ def __init__(self, tokenizer):
+ """
+ Initialize the SamplingProcessor.
+
+ Args:
+ tokenizer: BERT tokenizer instance
+ """
+ self.tokenizer = tokenizer
+
+ def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0):
+ """
+ Sample tokens for each mask in the sentence using the specified sampling technique.
+
+ Args:
+ mask_logits_dict (dict): Dictionary of mask positions and their logits/tokens
+ masked_sentence (str): Sentence with [MASK] tokens
+ sampling_technique (str): Sampling method to use
+ temperature (float): Temperature parameter for sampling
+
+ Returns:
+ str: Sentence with sampled tokens replacing masks
+ """
+ words = masked_sentence.split()
+
+ # Convert positions and logits to sorted list to process masks in order
+ mask_positions = sorted(mask_logits_dict.keys())
+
+ for mask_pos in mask_positions:
+ mask_data = mask_logits_dict[mask_pos]
+ mask_logits = torch.tensor(mask_data['logits'])
+ candidate_tokens = mask_data['tokens']
+
+ try:
+ if sampling_technique == "inverse_transform":
+ probs = torch.softmax(mask_logits / temperature, dim=-1)
+ cumulative_probs = torch.cumsum(probs, dim=-1)
+ random_prob = random.random()
+ sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()
+
+ elif sampling_technique == "exponential_minimum":
+ probs = torch.softmax(mask_logits / temperature, dim=-1)
+ exp_probs = torch.exp(-torch.log(probs))
+ random_probs = torch.rand_like(exp_probs)
+ sampled_index = torch.argmax(random_probs * exp_probs).item()
+
+ elif sampling_technique == "temperature":
+ mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8)
+ probs = torch.softmax(mask_logits / temperature, dim=-1)
+ if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
+ raise ValueError("The computed probabilities contain NaN or inf values.")
+ probs = torch.max(probs, torch.tensor(1e-8))
+ probs = probs / torch.sum(probs)
+ probs = probs.flatten()
+ if probs.size(0) > 1:
+ sampled_index = torch.multinomial(probs, 1).item()
+ else:
+ sampled_index = torch.argmax(probs).item()
+
+ elif sampling_technique == 'greedy':
+ sampled_index = torch.argmax(mask_logits).item()
+
+ else:
+ raise ValueError(f"Unknown sampling technique: {sampling_technique}")
+
+ # Use the sampled index to get the corresponding token
+ sampled_token = candidate_tokens[sampled_index]
+ # Remove ## if it's a subword token
+ sampled_token = sampled_token.replace('##', '')
+ words[mask_pos] = sampled_token
+
+ except Exception as e:
+ print(f"Error sampling for position {mask_pos}: {str(e)}")
+ continue
+
+ return " ".join(words)
+
+ def process_masked_sentences(self, results_dict, sampling_technique="temperature", temperature=1.0):
+ """
+ Process all masked sentences in the results dictionary.
+
+ Args:
+ results_dict (dict): Dictionary containing masked sentences and their logits
+ sampling_technique (str): Sampling method to use
+ temperature (float): Temperature parameter for sampling
+
+ Returns:
+ dict: Dictionary containing original, masked, and sampled sentences
+ """
+ processed_results = {}
+
+ for original_sentence, data in results_dict.items():
+ masked_sentence = data["masked_sentence"]
+ mask_logits = data["mask_logits"]
+
+ sampled_sentence = self.sample_tokens(
+ mask_logits,
+ masked_sentence,
+ sampling_technique,
+ temperature
+ )
+
+ processed_results[original_sentence] = {
+ "masked_sentence": masked_sentence,
+ "sampled_sentence": sampled_sentence
+ }
+
+ return processed_results
+
+
+if __name__ == "__main__":
+ sentences = [
+ "The quick brown fox jumps over the lazy dog everyday.",
+ "A speedy brown fox jumps over a lazy dog.",
+        "A swift brown fox leaps over the lethargic dog."
+    ]
+    result_dict = {
+ 'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
+ 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
+ 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
+ }
+
+ # First, mask the sentences
+ masking_processor = MaskingProcessor()
+ masking_results = masking_processor.process_sentences(sentences, result_dict)
+
+ # Then, sample replacements for the masks
+ sampling_processor = SamplingProcessor(masking_processor.tokenizer)
+
+ # Try different sampling techniques
+ sampling_techniques = ["temperature", "greedy", "inverse_transform", "exponential_minimum"]
+
+ for technique in sampling_techniques:
+ print(f"\nSampling using {technique}:")
+ sampled_results = sampling_processor.process_masked_sentences(
+ masking_results,
+ sampling_technique=technique,
+ temperature=1.0
+ )
+
+ '''
+ {
+ "original_sentence_1":
+ {
+ "masked_sentence": "sentence with [MASK] tokens",
+            "sampled_sentence": "sentence with sampled tokens",
+ },
+ "original_sentence_2":
+ {
+ "masked_sentence": "sentence with [MASK] tokens",
+            "sampled_sentence": "sentence with sampled tokens"
+ },
+ # ... and so on for each input sentence
+ },
+
+ '''
+
+ for original_sentence, result in sampled_results.items():
+ print(f"Original: {original_sentence}")
+ print(f"Masked: {result['masked_sentence']}")
+ print(f"Sampled: {result['sampled_sentence']}")
+ print("---")
+
diff --git a/utils/paraphraser.py b/utils/paraphraser.py
new file mode 100644
index 0000000000000000000000000000000000000000..daf1034b93d87c6e5bd8748719248efd9461f43c
--- /dev/null
+++ b/utils/paraphraser.py
@@ -0,0 +1,75 @@
+"""
+This file contains the code to generate paraphrases of sentences.
+"""
+import os
+import sys
+import logging
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from tqdm import tqdm # for progress bars
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from utils.config import load_config
+# config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.yaml')
+# config = load_config(config_path)['PECCAVI_TEXT']['Paraphrase']
+
+# Configure logging to show only warnings or above on the terminal.
+logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+class Paraphraser:
+ """
+ Paraphraser class to generate paraphrases of sentences.
+ """
+ def __init__(self, config):
+        self.config = config
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ tqdm.write(f"[Paraphraser] Initializing on device: {self.device}")
+ self.tokenizer = AutoTokenizer.from_pretrained(config['tokenizer'])
+ self.model = AutoModelForSeq2SeqLM.from_pretrained(config['model']).to(self.device)
+ self.num_beams = config['num_beams']
+ self.num_beam_groups = config['num_beam_groups']
+ self.num_return_sequences = config['num_return_sequences']
+ self.repetition_penalty = config['repetition_penalty']
+ self.diversity_penalty = config['diversity_penalty']
+ self.no_repeat_ngram_size = config['no_repeat_ngram_size']
+ self.temperature = config['temperature']
+ self.max_length = config['max_length']
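+        # Keys read from `config` above: tokenizer, model, num_beams, num_beam_groups,
+        # num_return_sequences, repetition_penalty, diversity_penalty,
+        # no_repeat_ngram_size, temperature, max_length (see the PECCAVI_TEXT ->
+        # Paraphrase section of the project's config.yaml).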
+
+ def paraphrase(self, sentence: str, num_return_sequences: int=None, num_beams: int=None, num_beam_groups: int=None):
+ tqdm.write(f"[Paraphraser] Starting paraphrase for sentence: {sentence}")
+ if num_return_sequences is None:
+ num_return_sequences = self.num_return_sequences
+ if num_beams is None:
+ num_beams = self.num_beams
+ if num_beam_groups is None:
+ num_beam_groups = self.num_beam_groups
+
+ inputs = self.tokenizer.encode("paraphrase: " + sentence,
+ return_tensors="pt",
+ max_length=self.max_length,
+ truncation=True).to(self.device)
+ outputs = self.model.generate(
+ inputs,
+ max_length=self.max_length,
+ num_beams=num_beams,
+ num_beam_groups=num_beam_groups,
+ num_return_sequences=num_return_sequences,
+ repetition_penalty=self.repetition_penalty,
+ diversity_penalty=self.diversity_penalty,
+ no_repeat_ngram_size=self.no_repeat_ngram_size,
+ temperature=self.temperature
+ )
+ paraphrases = [self.tokenizer.decode(output, skip_special_tokens=True)
+ for output in tqdm(outputs, desc="Decoding Paraphrases")]
+ tqdm.write(f"[Paraphraser] Paraphrase completed. {len(paraphrases)} outputs generated.")
+ return paraphrases
+
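+# Note on the generation settings above: Hugging Face's diverse (group) beam search
+# requires num_beams to be divisible by num_beam_groups, diversity_penalty only takes
+# effect when num_beam_groups > 1, and num_return_sequences must not exceed num_beams.
+# temperature, by contrast, is generally only consulted by sampling-based decoding.
+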
+if __name__ == "__main__":
+ config_path = '/home/jigyasu/PECCAVI-Text/utils/config.yaml'
+ config = load_config(config_path)
+ paraphraser = Paraphraser(config['PECCAVI_TEXT']['Paraphrase'])
+ sentence = "The quick brown fox jumps over the lazy dog."
+ paraphrases = paraphraser.paraphrase(sentence)
+ for paraphrase in paraphrases:
+ print(paraphrase)
\ No newline at end of file
diff --git a/utils/sampling.py b/utils/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..b341e8bec8864e801fd025e12a5ad574728e7de6
--- /dev/null
+++ b/utils/sampling.py
@@ -0,0 +1,181 @@
+import torch
+import random
+import logging
+from utils.masking_methods import MaskingProcessor
+from tqdm import tqdm
+
+# Configure logging to suppress INFO-level messages on the console.
+logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+class SamplingProcessor:
+ def __init__(self, tokenizer):
+ """
+ Initialize the SamplingProcessor.
+
+ Args:
+ tokenizer: BERT tokenizer instance
+ """
+ self.tokenizer = tokenizer
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ tqdm.write(f"[SamplingProcessor] Initialized on device: {self.device}")
+
+ def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0):
+ """
+ Sample tokens for each mask in the sentence using the specified sampling technique.
+
+ Args:
+ mask_logits_dict (dict): Dictionary of mask positions and their logits/tokens
+ masked_sentence (str): Sentence with [MASK] tokens
+ sampling_technique (str): Sampling method to use
+ temperature (float): Temperature parameter for sampling
+
+ Returns:
+ str: Sentence with sampled tokens replacing masks
+ """
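+        # Assumed structure of mask_logits_dict, inferred from how it is read below
+        # (the values are produced by MaskingProcessor):
+        #   {word_position: {"logits": [... candidate scores ...],
+        #                    "tokens": [... candidate token strings ...]}}
+        # where word_position indexes into masked_sentence.split().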
+        tqdm.write(f"[SamplingProcessor] Sampling tokens for: {masked_sentence}")
+        words = masked_sentence.split()
+        logger.debug(f"words: {words}")
+        # Convert positions and logits to a sorted list so masks are processed in order
+        mask_positions = sorted(mask_logits_dict.keys())
+        logger.debug(f"mask_positions: {mask_positions}")
+
+ for mask_pos in mask_positions:
+ mask_data = mask_logits_dict[mask_pos]
+ # Move logits tensor to GPU
+ mask_logits = torch.tensor(mask_data['logits']).to(self.device)
+ candidate_tokens = mask_data['tokens']
+
+ try:
+ if sampling_technique == "inverse_transform":
+ probs = torch.softmax(mask_logits / temperature, dim=-1)
+ cumulative_probs = torch.cumsum(probs, dim=-1)
+ random_prob = random.random()
+ sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()
+
+ elif sampling_technique == "exponential_minimum":
+ probs = torch.softmax(mask_logits / temperature, dim=-1)
+ exp_probs = torch.exp(-torch.log(probs))
+ random_probs = torch.rand_like(exp_probs)
+ sampled_index = torch.argmax(random_probs * exp_probs).item()
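+                    # exp(-log(p)) equals 1/p, so this draws u ~ Uniform(0, 1) per token and
+                    # picks argmax(u / p), which differs from the classical exponential-race
+                    # trick argmin(E / p) with E ~ Exp(1) and tends to favour low-probability
+                    # tokens.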
+
+ elif sampling_technique == "temperature":
+ mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8)
+ probs = torch.softmax(mask_logits / temperature, dim=-1)
+ if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
+ raise ValueError("The computed probabilities contain NaN or inf values.")
+ probs = torch.max(probs, torch.tensor(1e-8).to(self.device))
+ probs = probs / torch.sum(probs)
+ probs = probs.flatten()
+ if probs.size(0) > 1:
+ sampled_index = torch.multinomial(probs, 1).item()
+ else:
+ sampled_index = torch.argmax(probs).item()
+
+ elif sampling_technique == 'greedy':
+ sampled_index = torch.argmax(mask_logits).item()
+
+ else:
+ raise ValueError(f"Unknown sampling technique: {sampling_technique}")
+
+ # Use the sampled index to get the corresponding token
+ sampled_token = candidate_tokens[sampled_index]
+ # Remove ## if it's a subword token
+ sampled_token = sampled_token.replace('##', '')
+ words[mask_pos] = sampled_token
+ logger.info(f"Sampled token '{sampled_token}' for mask position {mask_pos}.")
+
+ except Exception as e:
+ logger.error(f"Error sampling for position {mask_pos}: {str(e)}")
+ continue
+
+ sampled_sentence = " ".join(words)
+ tqdm.write(f"[SamplingProcessor] Sampled sentence: {sampled_sentence}")
+ return sampled_sentence
+
+ def process_masked_sentences(self, results_dict, sampling_technique="temperature", temperature=1.0):
+ """
+ Process all masked sentences in the results dictionary.
+
+ Args:
+ results_dict (dict): Dictionary containing masked sentences and their logits
+ sampling_technique (str): Sampling method to use
+ temperature (float): Temperature parameter for sampling
+
+ Returns:
+ dict: Dictionary containing original, masked, and sampled sentences
+ """
+ tqdm.write("[SamplingProcessor] Starting sampling for masked sentences.")
+ processed_results = {}
+ # Wrap the iteration over each original sentence with tqdm
+ for original_sentence, data in tqdm(results_dict.items(), desc="Sampling Masked Sentences"):
+ masked_sentence = data["masked_sentence"]
+ mask_logits = data["mask_logits"]
+
+ sampled_sentence = self.sample_tokens(mask_logits,
+ masked_sentence,
+ sampling_technique,
+ temperature)
+ processed_results[original_sentence] = {
+ "masked_sentence": masked_sentence,
+ "sampled_sentence": sampled_sentence
+ }
+ logger.info(f"Processed sampling for sentence: {original_sentence}")
+ tqdm.write("[SamplingProcessor] Completed sampling for all sentences.")
+ return processed_results
+
+
+if __name__ == "__main__":
+ sentences = [
+ "The quick brown fox jumps over the lazy dog everyday.",
+ "A speedy brown fox jumps over a lazy dog.",
+ "A swift brown fox leaps over the lethargic dog."
+ ]
+ result_dict = {
+ 'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
+ 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
+ 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
+ }
+
+ # First, mask the sentences
+ masking_processor = MaskingProcessor()
+ masking_results = masking_processor.process_sentences(sentences, result_dict)
+
+ # Then, sample replacements for the masks
+ sampling_processor = SamplingProcessor(masking_processor.tokenizer)
+
+ # Try different sampling techniques
+ sampling_techniques = ["temperature", "greedy", "inverse_transform", "exponential_minimum"]
+
+ for technique in sampling_techniques:
+ logger.info(f"Sampling using technique: {technique}")
+ sampled_results = sampling_processor.process_masked_sentences(
+ masking_results,
+ sampling_technique=technique,
+ temperature=1.0
+ )
+
+        # sampled_results has the following shape for each sampling technique:
+        '''
+        {
+            "original_sentence_1":
+            {
+                "masked_sentence": "sentence with [MASK] tokens",
+                "sampled_sentence": "sentence with sampled tokens"
+            },
+            "original_sentence_2":
+            {
+                "masked_sentence": "sentence with [MASK] tokens",
+                "sampled_sentence": "sentence with sampled tokens"
+            },
+            # ... and so on for each input sentence
+        }
+        '''
+
+ for original_sentence, result in sampled_results.items():
+ logger.info(f"Original: {original_sentence}")
+ logger.info(f"Masked: {result['masked_sentence']}")
+ logger.info(f"Sampled: {result['sampled_sentence']}")
+ logger.info("---")
+
diff --git a/utils/watermark.py b/utils/watermark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5ff6d5cac0ea84fe073125943d02193c8c66ae9
--- /dev/null
+++ b/utils/watermark.py
@@ -0,0 +1,352 @@
+"""
+This file contains the code to watermark given sentences using PECCAVI
+"""
+import os
+import sys
+import time
+import random
+import torch
+
+# Ensure the project root is importable when this file is run directly.
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from utils.paraphraser import Paraphraser
+from utils.entailment import EntailmentAnalyzer
+from utils.sampling import SamplingProcessor
+# from tokenizer import tokenize_sentence, tokenize_sentences
+from utils.non_melting_point import NgramProcessor
+from utils.masking_methods import MaskingProcessor
+from tqdm import tqdm
+
+from renderers.highlighter import highlight_common_words, reparaphrased_sentences_html
+from renderers.tree import generate_subplot1, generate_subplot2
+from renderers.plot_3d import gen_three_D_plot
+# from metrics.detectability import SentenceDetectabilityCalculator
+# from metrics.distortion import SentenceDistortionCalculator
+# from metrics.euclidean_distance import SentenceEuclideanDistanceCalculator
+from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM
+from transformers import BertTokenizer, BertForMaskedLM
+from pathlib import Path
+
+
+from utils.config import load_config
+import logging
+
+project_root = Path(__file__).parent.parent
+config_path = project_root / "utils" / "config.yaml"
+
+# Update logging configuration to reduce clutter
+logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+class Watermarker:
+ def __init__(self, config):
+ self.config = config
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ tqdm.write(f"[Watermarker] Initializing on device: {self.device}")
+ self.user_prompt = None
+ self.paraphrased_sentences = None
+ self.analyzed_paraphrased_sentences = None
+ self.selected_sentences = None
+ self.discarded_sentences = None
+ self.common_grams = None
+ # self.subsequences = None
+ self.common_grams_position = None
+ self.masked_sentences = None
+ self.masked_words = None
+ self.masked_logits = None
+ self.sampled_sentences = None
+ self.reparaphrased_sentences = None
+ self.distortion_list = None
+ self.detectability_list = None
+ self.euclidean_dist_list = None
+
+        self.masking_strategies = ['random', 'pseudorandom', 'entropy']
+ self.sampling_strategies = ['inverse_transform', 'exponential_minimum', 'temperature', 'greedy']
+ self.masking_results = dict()
+ self.sampling_results = dict()
+
+        # Load the masked-LM tokenizer and model, moving the model to the selected device.
+ self.tokenizer = BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+ self.model = BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking").to(self.device)
+
+ self.paraphraser = Paraphraser(self.config['Paraphrase'])
+ self.entailment_analyzer = EntailmentAnalyzer(self.config['Entailment'])
+ self.ngram_processor = NgramProcessor()
+ self.masker = MaskingProcessor(self.tokenizer, self.model)
+ self.sampler = SamplingProcessor(self.tokenizer)
+
+ # self.detectability_calculator = SentenceDetectabilityCalculator(self.config['Metrics'])
+ # self.distortion_calculator = SentenceDistortionCalculator(self.config['Metrics'])
+ # self.euclidean_distance_calculator = SentenceEuclideanDistanceCalculator(self.config['Metrics'])
+
+
+    def Paraphrase(self, prompt: str, threshold: float = 0.7):
+        """
+        Paraphrases the given prompt and selects entailed paraphrases using PECCAVI.
+        Args:
+            prompt: str: The prompt to be paraphrased
+            threshold: float: Entailment-score threshold used to select paraphrases
+        Returns:
+            None. Results are stored on the instance (paraphrased_sentences,
+            selected_sentences, discarded_sentences, common_grams, full_list).
+        """
+ start_time = time.time()
+ self.user_prompt = prompt
+ self.paraphrased_sentences = self.paraphraser.paraphrase(self.user_prompt)
+ if self.paraphrased_sentences is None:
+            print("Error in generating paraphrases: could not complete step 1.")
+ return None
+
+ self.analyzed_paraphrased_sentences, self.selected_sentences, self.discarded_sentences = self.entailment_analyzer.analyze_entailment(self.user_prompt, self.paraphrased_sentences, threshold)
+
+        self.selected_sentences_list = list(self.selected_sentences.keys())
+        self.discarded_sentences_list = list(self.discarded_sentences.keys())
+        self.full_list = self.selected_sentences_list + self.discarded_sentences_list + [self.user_prompt]
+
+
+ # self.user_prompt_tokenized = tokenize_sentence(self.user_prompt)
+ # self.selected_sentences_tokenized = tokenize_sentences(self.selected_sentences)
+ # self.discarded_sentences_tokenized = tokenize_sentences(self.discarded_sentences)
+
+ # all_tokenized_sentences = []
+ # all_tokenized_sentences.append(self.user_prompt_tokenized)
+ # all_tokenized_sentences.extend(self.selected_sentences_tokenized)
+ # all_tokenized_sentences.extend(self.discarded_sentences_tokenized)
+
+ self.common_grams = self.ngram_processor.find_filtered_ngrams(self.full_list)
+ print(f"Common grams: {self.common_grams}")
+
+ if self.user_prompt in self.full_list:
+ self.full_list.remove(self.user_prompt)
+
+ # highlighted_user_prompt = highlight_common_words(self.common_grams, [self.user_prompt], "Highlighted LCS in the User Prompt")
+ # highlighted_accepted_sentences = highlight_common_words(self.common_grams, self.selected_sentences, "Highlighted LCS in the Accepted Sentences")
+ # highlighted_discarded_sentences = highlight_common_words(self.common_grams, self.discarded_sentences, "Highlighted LCS in the Discarded Sentences")
+
+ execution_time = time.time() - start_time
+ time_info = f"Step 1 completed in {execution_time:.2f} seconds"
+
+ # return [
+ # highlighted_user_prompt,
+ # highlighted_accepted_sentences,
+ # highlighted_discarded_sentences,
+ # time_info
+ # ]
+
+    def Masking(self):
+        """
+        For each masking strategy in self.masking_strategies, mask the sentences in
+        self.full_list (see the illustrative access example after this method).
+        Return structure:
+        {
+            "<masking_strategy_1>":
+ {
+ "Original sentence 1":
+ {
+ "masked_sentence": "The sentence with appropriate [MASK] tokens",
+ "mask_logits":
+ {
+ 3:
+ { # Example: mask index 3
+ "tokens": ["word1", "word2", ...], # Top predicted tokens
+ "logits": [score1, score2, ...] # Corresponding predicted scores
+ },
+ 7:
+ {
+ "tokens": ["wordA", "wordB", ...],
+ "logits": [scoreA, scoreB, ...]
+ },
+ # ... possibly additional mask positions
+ }
+ },
+ "Original sentence 2":
+ {
+ "masked_sentence": "Another masked sentence",
+ "mask_logits": { ... }
+ },
+ # ... more sentences processed for this strategy
+ },
+            "<masking_strategy_2>":
+ {
+ # Similar structure for each original sentence processed with masking_strategy2
+ },
+ # ... additional masking strategies if defined in self.masking_strategies
+ }
+ """
+ tqdm.write("[Watermarker] Starting Masking process.")
+ for strategy in self.masking_strategies:
+ tqdm.write(f"[Watermarker] Processing masking strategy: {strategy}")
+ results = self.masker.process_sentences(self.full_list, self.common_grams, strategy)
+ self.masking_results[strategy] = results
+ tqdm.write("[Watermarker] Masking process completed.")
+ return self.masking_results
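+
+    # Illustrative way to read the structure returned by Masking() (a sketch; the strategy
+    # name, sentence text, and mask position 3 below are placeholders, not guaranteed keys):
+    #   masking_results = watermarker.Masking()
+    #   entry = masking_results["entropy"]["A quick brown fox ..."]
+    #   entry["masked_sentence"]             # sentence text containing [MASK] tokens
+    #   entry["mask_logits"][3]["tokens"]    # top predicted tokens for mask position 3
+    #   entry["mask_logits"][3]["logits"]    # the corresponding scores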
+
+
+    def Sampling(self):
+        """
+        For each sampling strategy in self.sampling_strategies and each masking strategy in
+        self.masking_results, sample replacement tokens for the masked sentences.
+ Return structure:
+ {
+ "inverse_transform (SAMPLING STRATEGY)":
+ {
+ "random (MASKING STRATEGY)":
+ {
+ "Original sentence 1":
+ {
+ "masked_sentence": "Masked version of sentence 1",
+ "sampled_sentence": "Sampled version of sentence 1"
+ },
+ "Original sentence 2":
+ {
+ "masked_sentence": "Masked version of sentence 2",
+ "sampled_sentence": "Sampled version of sentence 2"
+ },
+ # ... additional original sentences
+ },
+ "pseudorandom":
+ {
+ # Similar structure for each original sentence
+ },
+ "entropy":
+ {
+ # Similar structure for each original sentence
+ },
+ },
+ "exponential_minimum":
+ {
+ # Similar nested dictionaries for each masking strategy and original sentence
+            },
+            "temperature":
+            {
+                # Similar nested dictionaries for each masking strategy and original sentence
+            },
+            "greedy":
+ {
+ # Similar nested dictionaries for each masking strategy and original sentence
+ }
+ }
+ """
+ tqdm.write("[Watermarker] Starting Sampling process.")
+ for strategy in self.sampling_strategies:
+ tqdm.write(f"[Watermarker] Processing sampling strategy: {strategy}")
+ self.sampling_results[strategy] = {}
+ for mask_strategy in self.masking_strategies:
+ results = self.sampler.process_masked_sentences(
+ self.masking_results[mask_strategy],
+ sampling_technique=strategy,
+ temperature=1.0
+ )
+ self.sampling_results[strategy][mask_strategy] = results
+ tqdm.write("[Watermarker] Sampling process completed.")
+ return self.sampling_results
+
+ def re_paraphrasing(self):
+ tqdm.write("[Watermarker] Starting re-paraphrasing process.")
+ self.reparaphrasing_results = {}
+ for sampling_strategy, mask_dict in tqdm(self.sampling_results.items(), desc="Sampling Strategies", leave=True):
+ self.reparaphrasing_results[sampling_strategy] = {}
+ for mask_strategy, sentences_data in tqdm(mask_dict.items(), desc="Masking Strategies", leave=False):
+ self.reparaphrasing_results[sampling_strategy][mask_strategy] = {}
+ for original_sentence, result in tqdm(sentences_data.items(), desc="Sentences", leave=False):
+ sampled_sentence = result.get("sampled_sentence", None)
+ if sampled_sentence:
+ new_paraphrases = self.paraphraser.paraphrase(sampled_sentence,
+ num_return_sequences=10,
+ num_beams=10)
+ else:
+ new_paraphrases = []
+ self.reparaphrasing_results[sampling_strategy][mask_strategy][original_sentence] = {
+ "masking_strategy": mask_strategy,
+ "sampling_strategy": sampling_strategy,
+ "sampled_sentence": sampled_sentence,
+ "re_paraphrased_sentences": new_paraphrases
+ }
+ tqdm.write("[Watermarker] Re-paraphrasing process completed.")
+ return self.reparaphrasing_results
+
+ def calculate_distortion(self):
+ return None
+
+if __name__ == "__main__":
+ # config_path = '/home/jigyasu/PECCAVI-Text/utils/config.yaml'
+ config = load_config(config_path)['PECCAVI_TEXT']
+ watermarker = Watermarker(config)
+
+ logger.info("Starting main Watermarker process.")
+ print("==> Paraphrasing:")
+ watermarker.Paraphrase("The quick brown fox jumps over small cat the lazy dog everyday again and again.")
+ logger.info("Paraphrasing completed.")
+
+ # Prepare a list to accumulate result strings
+ results_str = []
+ results_str.append("========== WATERMARKING RESULTS ==========\n\n")
+
+ # --- Step 2: Common N-grams ---
+ results_str.append("==> Common N-grams:\n")
+ if watermarker.common_grams:
+ for ngram, positions in watermarker.common_grams.items():
+ results_str.append(f" {ngram}: {positions}\n")
+ else:
+ results_str.append(" No common n-grams found.\n")
+
+ # --- Step 3: Selected Sentences ---
+ results_str.append("\n==> Selected Sentences:\n")
+ if watermarker.selected_sentences:
+ for sentence in watermarker.selected_sentences:
+ results_str.append(f" {sentence}\n")
+ else:
+ results_str.append(" No selected sentences available.\n")
+
+ # --- Step 4: Masking Results (without logits) ---
+ results_str.append("\n==> Masking Results:\n")
+ masking_results = watermarker.Masking()
+ for masking_strategy, results_dict in masking_results.items():
+ results_str.append(f"\n-- Masking Strategy: {masking_strategy} --\n")
+ for original_sentence, data in results_dict.items():
+ masked_sentence = data.get("masked_sentence", "")
+ results_str.append("Original:\n")
+ results_str.append(f" {original_sentence}\n")
+ results_str.append("Masked:\n")
+ results_str.append(f" {masked_sentence}\n")
+ results_str.append("-----\n")
+
+ # --- Step 5: Sampling Results ---
+ results_str.append("\n==> Sampling Results:\n")
+ sampling_results = watermarker.Sampling()
+ for sampling_strategy, mask_strategy_dict in sampling_results.items():
+ results_str.append(f"\n-- Sampling Strategy: {sampling_strategy} --\n")
+ for mask_strategy, sentences in mask_strategy_dict.items():
+ results_str.append(f"\n Masking Strategy: {mask_strategy}\n")
+ for original_sentence, res in sentences.items():
+ masked_sentence = res.get("masked_sentence", "")
+ sampled_sentence = res.get("sampled_sentence", "")
+ results_str.append(" Original:\n")
+ results_str.append(f" {original_sentence}\n")
+ results_str.append(" Masked:\n")
+ results_str.append(f" {masked_sentence}\n")
+ results_str.append(" Sampled:\n")
+ results_str.append(f" {sampled_sentence}\n")
+ results_str.append(" -----\n")
+
+ # --- Step 6: Re-paraphrasing Results ---
+ results_str.append("\n==> Re-paraphrasing Results:\n")
+ reparaphrasing_results = watermarker.re_paraphrasing()
+ for sampling_strategy, mask_dict in reparaphrasing_results.items():
+ results_str.append(f"\n-- Sampling Strategy: {sampling_strategy} --\n")
+ for mask_strategy, orig_sentence_dict in mask_dict.items():
+ results_str.append(f"\n Masking Strategy: {mask_strategy}\n")
+ for original_sentence, data in orig_sentence_dict.items():
+ sampled_sentence = data.get("sampled_sentence", "")
+ re_paraphrases = data.get("re_paraphrased_sentences", [])
+ results_str.append(" Original:\n")
+ results_str.append(f" {original_sentence}\n")
+ results_str.append(" Sampled:\n")
+ results_str.append(f" {sampled_sentence}\n")
+ results_str.append(" Re-paraphrased (first 3 examples):\n")
+ # Display only the first 3 re-paraphrases for brevity
+ for idx, rp in enumerate(re_paraphrases[:3]):
+ results_str.append(f" {idx+1}. {rp}\n")
+ results_str.append(" -----\n")
+
+ # Write all results to the output file
+ output_file = "watermarking_results.txt"
+ with open(output_file, "w", encoding="utf-8") as f:
+ f.writelines(results_str)
+
+    logger.info(f"Results written to {output_file}.")
+ print("\nResults have been written to", output_file)
\ No newline at end of file
diff --git a/utils/watermarking_results.txt b/utils/watermarking_results.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8c6db638350a643fb4b6aa28b785fe36d8295bf3
--- /dev/null
+++ b/utils/watermarking_results.txt
@@ -0,0 +1,547 @@
+========== WATERMARKING RESULTS ==========
+
+==> Common N-grams:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.: {'brown fox': [(2, 3)], 'dog': [(9, 9)], 'small': [(12, 12)]}
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.: {'brown fox': [(3, 4)], 'dog': [(9, 9)], 'small': [(12, 12)]}
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.: {'brown fox': [(2, 3)], 'dog': [(10, 10)], 'small': [(6, 6)]}
+
+==> Selected Sentences:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+
+==> Masking Results:
+
+-- Masking Strategy: random --
+Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+-----
+Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+Masked:
+ Repeatedly, the [MASK] brown fox leaps over an [MASK] dog and its small [MASK] .
+-----
+Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+Masked:
+ The [MASK] brown fox jumps over [MASK] [MASK] the lazy dog everyday again and again .
+-----
+
+-- Masking Strategy: pseudorandom --
+Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+-----
+Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+Masked:
+ [MASK] the rapid brown fox [MASK] over an inactive dog and its small [MASK] .
+-----
+Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+Masked:
+ The [MASK] brown fox jumps over [MASK] cat the lazy [MASK] everyday again and again .
+-----
+
+-- Masking Strategy: entropy --
+Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+-----
+Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+Masked:
+ [MASK] the rapid brown fox leaps over an [MASK] dog and its small [MASK] .
+-----
+Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+Masked:
+ The [MASK] brown fox jumps over small cat the [MASK] dog everyday again and again .
+-----
+
+==> Sampling Results:
+
+-- Sampling Strategy: inverse_transform --
+
+ Masking Strategy: random
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+ Sampled:
+ A massive brown fox frequently leaps over the neighborhood dog, who is small, on a regular basis .
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Masked:
+ Repeatedly, the [MASK] brown fox leaps over an [MASK] dog and its small [MASK] .
+ Sampled:
+ Repeatedly, the little brown fox leaps over an adult dog and its small body .
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Masked:
+ The [MASK] brown fox jumps over [MASK] [MASK] the lazy dog everyday again and again .
+ Sampled:
+ The large brown fox jumps over the bucks the lazy dog everyday again and again .
+ -----
+
+ Masking Strategy: pseudorandom
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+ Sampled:
+ A big brown fox frequently leaps over the white dog, who is small, on a regular basis .
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Masked:
+ [MASK] the rapid brown fox [MASK] over an inactive dog and its small [MASK] .
+ Sampled:
+ There the rapid brown fox jumps over an inactive dog and its small owner .
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Masked:
+ The [MASK] brown fox jumps over [MASK] cat the lazy [MASK] everyday again and again .
+ Sampled:
+ The gray brown fox jumps over the cat the lazy brown everyday again and again .
+ -----
+
+ Masking Strategy: entropy
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+ Sampled:
+ A huge brown fox frequently leaps over the guard dog, who is small, on a consistent basis .
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Masked:
+ [MASK] the rapid brown fox leaps over an [MASK] dog and its small [MASK] .
+ Sampled:
+ Suddenly the rapid brown fox leaps over an imaginary dog and its small tail .
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Masked:
+ The [MASK] brown fox jumps over small cat the [MASK] dog everyday again and again .
+ Sampled:
+ The big brown fox jumps over small cat the black dog everyday again and again .
+ -----
+
+-- Sampling Strategy: exponential_minimum --
+
+ Masking Strategy: random
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+ Sampled:
+ A hunting brown fox frequently leaps over the domestic dog, who is small, on a large basis .
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Masked:
+ Repeatedly, the [MASK] brown fox leaps over an [MASK] dog and its small [MASK] .
+ Sampled:
+ Repeatedly, the smaller brown fox leaps over an unfortunate dog and its small collar .
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Masked:
+ The [MASK] brown fox jumps over [MASK] [MASK] the lazy dog everyday again and again .
+ Sampled:
+ The night brown fox jumps over he bird the lazy dog everyday again and again .
+ -----
+
+ Masking Strategy: pseudorandom
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+ Sampled:
+ A male brown fox frequently leaps over the opposing dog, who is small, on a game basis .
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Masked:
+ [MASK] the rapid brown fox [MASK] over an inactive dog and its small [MASK] .
+ Sampled:
+ Only the rapid brown fox sits over an inactive dog and its small slave .
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Masked:
+ The [MASK] brown fox jumps over [MASK] cat the lazy [MASK] everyday again and again .
+ Sampled:
+ The noisy brown fox jumps over he cat the lazy to everyday again and again .
+ -----
+
+ Masking Strategy: entropy
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+ Sampled:
+ A hunting brown fox frequently leaps over the domestic dog, who is small, on a game basis .
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Masked:
+ [MASK] the rapid brown fox leaps over an [MASK] dog and its small [MASK] .
+ Sampled:
+ With the rapid brown fox leaps over an enthusiastic dog and its small target .
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Masked:
+ The [MASK] brown fox jumps over small cat the [MASK] dog everyday again and again .
+ Sampled:
+ The brave brown fox jumps over small cat the grey dog everyday again and again .
+ -----
+
+-- Sampling Strategy: greedy --
+
+ Masking Strategy: random
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+ Sampled:
+ A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis .
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Masked:
+ Repeatedly, the [MASK] brown fox leaps over an [MASK] dog and its small [MASK] .
+ Sampled:
+ Repeatedly, the great brown fox leaps over an old dog and its small owner .
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Masked:
+ The [MASK] brown fox jumps over [MASK] [MASK] the lazy dog everyday again and again .
+ Sampled:
+ The big brown fox jumps over the rabbit the lazy dog everyday again and again .
+ -----
+
+ Masking Strategy: pseudorandom
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+ Sampled:
+ A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis .
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Masked:
+ [MASK] the rapid brown fox [MASK] over an inactive dog and its small [MASK] .
+ Sampled:
+ Suddenly the rapid brown fox jumps over an inactive dog and its small owner .
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Masked:
+ The [MASK] brown fox jumps over [MASK] cat the lazy [MASK] everyday again and again .
+ Sampled:
+ The big brown fox jumps over the cat the lazy brown everyday again and again .
+ -----
+
+ Masking Strategy: entropy
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Masked:
+ A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis .
+ Sampled:
+ A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis .
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Masked:
+ [MASK] the rapid brown fox leaps over an [MASK] dog and its small [MASK] .
+ Sampled:
+ Suddenly the rapid brown fox leaps over an old dog and its small owner .
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Masked:
+ The [MASK] brown fox jumps over small cat the [MASK] dog everyday again and again .
+ Sampled:
+ The big brown fox jumps over small cat the small dog everyday again and again .
+ -----
+
+==> Re-paraphrasing Results:
+
+-- Sampling Strategy: inverse_transform --
+
+ Masking Strategy: random
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Sampled:
+ A massive brown fox frequently leaps over the neighborhood dog, who is small, on a regular basis .
+ Re-paraphrased (first 3 examples):
+ 1. The small dog in the neighborhood is frequently jumped by a massive brown fox.
+ 2. A large brown fox frequently jumps over the small dog that lives nearby.
+ 3. The neighborhood dog, which is small, is frequently jumped by a massive brown fox.
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Sampled:
+ Repeatedly, the little brown fox leaps over an adult dog and its small body .
+ Re-paraphrased (first 3 examples):
+ 1. The small brown fox repeatedly jumps over an adult dog and its tiny body.
+ 2. Repeatedly, the small brown fox jumps over an adult dog and its tiny body.
+ 3. On numerous occasions, the small brown fox leaps over an adult dog and its tiny body.
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Sampled:
+ The large brown fox jumps over the bucks the lazy dog everyday again and again .
+ Re-paraphrased (first 3 examples):
+ 1. Every day, the big brown fox jumps over the lazy dog's body.
+ 2. The oversized brown fox jumps over the lazy dog's body on a regular basis.
+ 3. A persistent action is the jumping of the large brown fox over the lazy dog's body.
+ -----
+
+ Masking Strategy: pseudorandom
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Sampled:
+ A big brown fox frequently leaps over the white dog, who is small, on a regular basis .
+ Re-paraphrased (first 3 examples):
+ 1. The small white dog is frequently jumped by a large brown fox.
+ 2. It is common for a large brown fox to jump over the small white dog.
+ 3. A large brown fox often jumps over the small white dog.
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Sampled:
+ There the rapid brown fox jumps over an inactive dog and its small owner .
+ Re-paraphrased (first 3 examples):
+ 1. A speedy brown fox leaps over a doggy and its small master.
+ 2. The swift brown fox leaps over an idle dog and its small owner.
+ 3. An agile brown fox swiftly leaps over a dour dog and its small master.
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Sampled:
+ The gray brown fox jumps over the cat the lazy brown everyday again and again .
+ Re-paraphrased (first 3 examples):
+ 1. Every day, the gray brown fox jumps over the lazy brown cat.
+ 2. On a regular basis, the gray brown fox jumps over the lazy brown cat.
+ 3. The gray brown fox repeatedly jumps over the lazy brown cat.
+ -----
+
+ Masking Strategy: entropy
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Sampled:
+ A huge brown fox frequently leaps over the guard dog, who is small, on a consistent basis .
+ Re-paraphrased (first 3 examples):
+ 1. The guard dog, which is small, is frequently jumped over by a massive brown fox.
+ 2. A small guard dog is regularly jumped by a massive brown fox.
+ 3. Every so often a large brown fox jumps over the small guard dog.
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Sampled:
+ Suddenly the rapid brown fox leaps over an imaginary dog and its small tail .
+ Re-paraphrased (first 3 examples):
+ 1. The swift brown fox instinctively jumps over an imaginary dog and its tiny tail.
+ 2. Suddenly, the swift brown fox jumps over an imaginary dog and its tiny tail.
+ 3. In an instant, the swift brown fox jumps over a make-believe dog and its tiny tail.
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Sampled:
+ The big brown fox jumps over small cat the black dog everyday again and again .
+ Re-paraphrased (first 3 examples):
+ 1. The big brown fox repeatedly jumps over the small cat and black dog.
+ 2. Every day, the large brown fox jumps over a small cat and black dog.
+ 3. On a regular basis, the large brown fox jumps over the little cat and black dog.
+ -----
+
+-- Sampling Strategy: exponential_minimum --
+
+ Masking Strategy: random
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Sampled:
+ A hunting brown fox frequently leaps over the domestic dog, who is small, on a large basis .
+ Re-paraphrased (first 3 examples):
+ 1. The domestic dog, despite its size, is frequently leapt over by a brown fox engaged in hunting.
+ 2. A brown fox used for hunting is known to jump over the small domestic dog on large ground.
+ 3. The small domestic dog is frequently jumped over by a hunting brown fox on 'large grounds'.
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Sampled:
+ Repeatedly, the smaller brown fox leaps over an unfortunate dog and its small collar .
+ Re-paraphrased (first 3 examples):
+ 1. The small brown fox repeatedly jumps over an unfortunate dog and its tiny collar.
+ 2. Repeatedly, the smaller brown fox jumps over an unfortunate dog and its small collar.
+ 3. On a regular basis, the smaller brown fox jumps over an unfortunate dog and its small collar.
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Sampled:
+ The night brown fox jumps over he bird the lazy dog everyday again and again .
+ Re-paraphrased (first 3 examples):
+ 1. The bird the lazy dog is frequently jumped over by the night brown fox.
+ 2. Every so often, the brown fox of the night jumps over the bird's lazy dog.
+ 3. On a regular basis, the brown fox of night jumps over the lazy dog bird.
+ -----
+
+ Masking Strategy: pseudorandom
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Sampled:
+ A male brown fox frequently leaps over the opposing dog, who is small, on a game basis .
+ Re-paraphrased (first 3 examples):
+ 1. On occasion, a male brown fox jumps over the small dog that is on the other side as part of its game.
+ 2. The male brown fox is known to jump over the small dog in game on a regular basis.
+ 3. During game, a male brown fox frequently jumps over the small dog that is on the same side.
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Sampled:
+ Only the rapid brown fox sits over an inactive dog and its small slave .
+ Re-paraphrased (first 3 examples):
+ 1. An inactive dog and its small slave are accompanied by only one swift brown fox.
+ 2. The only creature that sits on top of an inactive dog and its small slave is a swift brown fox.
+ 3. A slow brown fox is the sole entity leaning against an inactive dog and its small slave.
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Sampled:
+ The noisy brown fox jumps over he cat the lazy to everyday again and again .
+ Re-paraphrased (first 3 examples):
+ 1. Every once in a while, the noisy brown fox jumps over his lazy lazy cat.
+ 2. Each time he touches the lazy cat, the brown fox, who is loud, jumps over it.
+ 3. The shrieking brown fox jumps over his lazy lazy owner on a regular basis.
+ -----
+
+ Masking Strategy: entropy
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Sampled:
+ A hunting brown fox frequently leaps over the domestic dog, who is small, on a game basis .
+ Re-paraphrased (first 3 examples):
+ 1. The small domestic dog is frequently jumped over by a hunting brown fox when it's playing.
+ 2. A brown fox, which is often used for hunting purposes, jumps over a small domestic dog when it comes in contact with the game.
+ 3. For hunting purposes, a brown fox often jumps over the small domestic dog.
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Sampled:
+ With the rapid brown fox leaps over an enthusiastic dog and its small target .
+ Re-paraphrased (first 3 examples):
+ 1. The swift brown fox leaps over a lively dog and its tiny prey.
+ 2. A speedy brown fox leaps over a playful dog and its small prey.
+ 3. An eager dog and its small prey are swiftly spooked by the fast-moving brown fox.
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Sampled:
+ The brave brown fox jumps over small cat the grey dog everyday again and again .
+ Re-paraphrased (first 3 examples):
+ 1. Every day, the courageous brown fox jumps over a small cat and grey dog.
+ 2. The fearless brown fox repeatedly jumps over the small cat and grey dog.
+ 3. The courageous brown fox repeatedly jumps over the grey dog and small cat.
+ -----
+
+-- Sampling Strategy: greedy --
+
+ Masking Strategy: random
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Sampled:
+ A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis .
+ Re-paraphrased (first 3 examples):
+ 1. The nearest dog is frequently swarmed by a big brown fox.
+ 2. It is common for a large brown fox to jump over the nearest dog, even though it is small.
+ 3. A large brown fox is known to jump over the nearest dog, which is a small canine.
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Sampled:
+ Repeatedly, the great brown fox leaps over an old dog and its small owner .
+ Re-paraphrased (first 3 examples):
+ 1. The great brown fox repeatedly jumps over an elderly dog and its small owner.
+ 2. On numerous occasions, the great brown fox jumps over an elderly dog and its small owner.
+ 3. An elderly dog and its small owner are repeatedly jumped by the great brown fox.
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Sampled:
+ The big brown fox jumps over the rabbit the lazy dog everyday again and again .
+ Re-paraphrased (first 3 examples):
+ 1. Every day, the large brown fox jumps over the lazy dog and falls back down.
+ 2. On a regular basis, the large brown fox jumps over the lazy dog and rabbit.
+ 3. Each time the unruly rabbit and the lazy dog are spotted, the big brown fox jumps over them.
+ -----
+
+ Masking Strategy: pseudorandom
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Sampled:
+ A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis .
+ Re-paraphrased (first 3 examples):
+ 1. The nearest dog is frequently swarmed by a big brown fox.
+ 2. It is common for a large brown fox to jump over the nearest dog, even though it is small.
+ 3. A large brown fox is known to jump over the nearest dog, which is a small canine.
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Sampled:
+ Suddenly the rapid brown fox jumps over an inactive dog and its small owner .
+ Re-paraphrased (first 3 examples):
+ 1. A speedy brown fox leaps over a doggy and its small owner without any prior thought.
+ 2. In an instant, the swift brown fox leaps over a doddery dog and its small owner.
+ 3. The swift brown fox leaps over an idle dog and its small owner without warning.
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Sampled:
+ The big brown fox jumps over the cat the lazy brown everyday again and again .
+ Re-paraphrased (first 3 examples):
+ 1. Every now and then, the large brown fox jumps over the lazy brown cat.
+ 2. On a regular basis, the large brown fox jumps over the lazy brown cat.
+ 3. The large brown fox repeatedly jumps over the lazy brown cat.
+ -----
+
+ Masking Strategy: entropy
+ Original:
+ A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.
+ Sampled:
+ A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis .
+ Re-paraphrased (first 3 examples):
+ 1. The nearest dog is frequently swarmed by a big brown fox.
+ 2. It is common for a large brown fox to jump over the nearest dog, even though it is small.
+ 3. A large brown fox is known to jump over the nearest dog, which is a small canine.
+ -----
+ Original:
+ Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.
+ Sampled:
+ Suddenly the rapid brown fox leaps over an old dog and its small owner .
+ Re-paraphrased (first 3 examples):
+ 1. The brown fox, which is fast and agile, suddenly jumps over an elderly dog and its small owner.
+ 2. A speedy brown fox suddenly jumps over an old dog and its small owner.
+ 3. In an instant, the swift brown fox leaps over a small owner and an elderly dog.
+ -----
+ Original:
+ The quick brown fox jumps over small cat the lazy dog everyday again and again.
+ Sampled:
+ The big brown fox jumps over small cat the small dog everyday again and again .
+ Re-paraphrased (first 3 examples):
+ 1. Every day, the big brown fox jumps over a small dog and cat.
+ 2. The big brown fox repeatedly jumps over the little cat and dog.
+ 3. Every now and then, the big brown fox jumps over a small dog or cat.
+ -----