diff --git a/UI/__pycache__/gradio.cpython-310.pyc b/UI/__pycache__/gradio.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e268c233039b5025d099cbfab83b8e032b20507a Binary files /dev/null and b/UI/__pycache__/gradio.cpython-310.pyc differ diff --git a/UI/__pycache__/gradio.cpython-311.pyc b/UI/__pycache__/gradio.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2de60032ba3110fc885333ad0734831e2570a162 Binary files /dev/null and b/UI/__pycache__/gradio.cpython-311.pyc differ diff --git a/UI/gradio.py b/UI/gradio.py new file mode 100644 index 0000000000000000000000000000000000000000..eec1b35b24ea316265bccd78fe16548988499a75 --- /dev/null +++ b/UI/gradio.py @@ -0,0 +1,516 @@ +import gradio as gr +from utils.watermark import Watermarker +from utils.config import load_config +from renderers.highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html +from renderers.tree import generate_subplot1, generate_subplot2 +from pathlib import Path +import time +from typing import Dict, List, Tuple, Any +import plotly.graph_objects as go + +class WatermarkerInterface: + def __init__(self, config): + + self.pipeline = Watermarker(config) + self.common_grams = {} + self.highlight_info = [] + self.masked_sentences = [] + + def handle_paraphrase(self, prompt: str) -> Tuple[str, str, str, str]: + """Wrapper for paraphrasing that includes highlighting""" + start_time = time.time() + + # Run paraphrasing + self.pipeline.Paraphrase(prompt) + + # Step 1: Process the original sentence first + seen_ngrams = {} # Stores first occurrence index of each n-gram + original_indexed_ngrams = [] # Final indexed list for original + + original_sentence = self.pipeline.user_prompt + original_ngrams = self.pipeline.common_grams.get(original_sentence, {}) + + # Step 1.1: Extract n-grams and their first occurrence index + ngram_occurrences = [ + (min(indices, key=lambda x: x[0])[0], gram) # Get first index + for gram, indices in original_ngrams.items() + ] + + # Step 1.2: Sort n-grams based on their first occurrence + ngram_occurrences.sort() + + # Step 1.3: Assign sequential indices + for idx, (position, gram) in enumerate(ngram_occurrences, start=1): + seen_ngrams[gram] = idx # Assign sequential index + original_indexed_ngrams.append((idx, gram)) + + print("Original Indexed N-grams:", original_indexed_ngrams) + + #generate highlight_info + colors = ["red", "blue", "green", "purple", "orange"] + highlight_info = [ + (ngram, colors[i % len(colors)]) + for i, (index, ngram) in enumerate(original_indexed_ngrams) + ] + common_grams = original_indexed_ngrams + self.highlight_info = highlight_info + self.common_grams = common_grams + + # Step 2: Process paraphrased sentences and match indices + paraphrase_indexed_ngrams = {} + + for sentence in self.pipeline.paraphrased_sentences: + sentence_ngrams = [] # Stores n-grams for this sentence + sentence_ngrams_dict = self.pipeline.common_grams.get(sentence, {}) + + for gram, indices in sentence_ngrams_dict.items(): + first_occurrence = min(indices, key=lambda x: x[0])[0] + + # Use the original's index if exists, otherwise assign a new one + if gram in seen_ngrams: + index = seen_ngrams[gram] # Use the same index as original + else: + index = len(seen_ngrams) + 1 # Assign new index + seen_ngrams[gram] = index # Store it + + sentence_ngrams.append((index, gram)) + + sentence_ngrams.sort() + paraphrase_indexed_ngrams[sentence] = sentence_ngrams + + print("Paraphrase Indexed 
N-grams:", paraphrase_indexed_ngrams) + + # Step 3: Generate highlighted versions using the renderer + highlighted_prompt = highlight_common_words( + common_grams, + [self.pipeline.user_prompt], + "Original Prompt with Highlighted Common Sequences" + ) + + highlighted_accepted = highlight_common_words_dict( + common_grams, + self.pipeline.selected_sentences, + "Accepted Paraphrased Sentences with Entailment Scores" + ) + + highlighted_discarded = highlight_common_words_dict( + common_grams, + self.pipeline.discarded_sentences, + "Discarded Paraphrased Sentences with Entailment Scores" + ) + + execution_time = f"
Step 1 completed in {time.time() - start_time:.2f} seconds
" + self.highlight_info = highlight_info + self.common_grams = common_grams + + return highlighted_prompt, highlighted_accepted, highlighted_discarded, execution_time + + def handle_masking(self) -> Tuple[List[go.Figure], str]: + """Wrapper for masking that generates visualization trees""" + start_time = time.time() + + masking_results = self.pipeline.Masking() + trees = [] + highlight_info = self.highlight_info + common_grams = self.common_grams + sentence_to_masked = {} + + # Create a consolidated figure with all strategies + original_sentence = None + + # First pass - gather all sentences and strategies + for strategy, sentence_dict in masking_results.items(): + for sent, data in sentence_dict.items(): + if sent not in sentence_to_masked: + sentence_to_masked[sent] = [] + try: + if not isinstance(data, dict): + print(f"[ERROR] Data is not a dictionary for {sent} with strategy {strategy}") + continue + + masked_sentence = data.get("masked_sentence", "") + if masked_sentence: + sentence_to_masked[sent].append((masked_sentence, strategy)) + except Exception as e: + print(f"Error processing {strategy} for sentence {sent}: {e}") + + for original_sentence, masked_sentences_data in sentence_to_masked.items(): + if not masked_sentences_data: + continue + masked_sentences = [ms[0] for ms in masked_sentences_data] + strategies = [ms[1] for ms in masked_sentences_data] + try: + + fig = generate_subplot1( + original_sentence, + masked_sentences, + strategies, + highlight_info, + common_grams + ) + trees.append(fig) + except Exception as e: + print(f"Error generating multi-strategy tree: {e}") + trees.append(go.Figure()) + + # Pad with empty plots if needed + while len(trees) < 10: + trees.append(go.Figure()) + + execution_time = f"
Step 2 completed in {time.time() - start_time:.2f} seconds
" + + return trees[:10] + [execution_time] + + def handle_sampling(self) -> Tuple[List[go.Figure], str]: + """Wrapper for sampling that generates visualization trees""" + start_time = time.time() + sampling_results = self.pipeline.Sampling() + trees = [] + + # Group sentences by original sentence + organized_results = {} + + # Generate trees for each sampled sentence + for sampling_strategy, masking_dict in sampling_results.items(): + for masking_strategy, sentences in masking_dict.items(): + for original_sentence, data in sentences.items(): + if original_sentence not in organized_results: + organized_results[original_sentence] = {} + + if masking_strategy not in organized_results[original_sentence]: + organized_results[original_sentence][masking_strategy] = { + "masked_sentence": data.get("masked_sentence", ""), # Corrected reference + "sampled_sentences": {} + } + + # Add this sampling result + organized_results[original_sentence][masking_strategy]["sampled_sentences"][sampling_strategy] = data.get("sampled_sentence", "") + + for original_sentence, data in organized_results.items(): + masked_sentences = [] + all_sampled_sentences = [] + + for masking_strategy, masking_data in list(data.items())[:3]: # Ensure this iteration is safe + masked_sentence = masking_data.get("masked_sentence", "") + if masked_sentence: + masked_sentences.append(masked_sentence) + + for sampling_strategy, sampled_sentence in masking_data.get("sampled_sentences", {}).items(): + if sampled_sentence: + all_sampled_sentences.append(sampled_sentence) + + if masked_sentences: + try: + fig = generate_subplot2( + masked_sentences, + all_sampled_sentences, + self.highlight_info, + self.common_grams + ) + trees.append(fig) + except Exception as e: + print(f"Error generating subplot for {original_sentence}: {e}") + trees.append(go.Figure()) + + while len(trees) < 10: + trees.append(go.Figure()) + + execution_time = f"
Step 3 completed in {time.time() - start_time:.2f} seconds
" + + return trees[:10] + [execution_time] + + def handle_reparaphrasing(self) -> Tuple[List[str], str]: + """Wrapper for re-paraphrasing that formats results as HTML""" + start_time = time.time() + + results = self.pipeline.re_paraphrasing() + html_outputs = [] + + # Generate HTML for each batch of re-paraphrased sentences + for sampling_strategy, masking_dict in results.items(): + for masking_strategy, sentences in masking_dict.items(): + for original_sent, data in sentences.items(): + if data["re_paraphrased_sentences"]: + html = reparaphrased_sentences_html(data["re_paraphrased_sentences"]) + html_outputs.append(html) + + # Pad with empty HTML if needed + while len(html_outputs) < 120: + html_outputs.append("") + + execution_time = f"
Step 4 completed in {time.time() - start_time:.2f} seconds
" + + return html_outputs[:120] + [execution_time] + + +def create_gradio_interface(config): + """Creates the Gradio interface with the updated pipeline""" + interface = WatermarkerInterface(config) + + with gr.Blocks(theme=gr.themes.Monochrome()) as demo: + #CSS to enable scrolling for reparaphrased sentences and sampling plots + demo.css = """ +/* Set fixed height for the reparaphrased tabs container only */ +.gradio-container .tabs[id="reparaphrased-tabs"], +.gradio-container .tabs[id="sampling-tabs"] { + overflow-x: hidden; + white-space: normal; + border-radius: 8px; + max-height: 600px; /* Set fixed height for the entire tabs component */ + overflow-y: auto; /* Enable vertical scrolling inside the container */ +} + +/* Tab content styling for reparaphrased and sampling tabs */ +.gradio-container .tabs[id="reparaphrased-tabs"] .tabitem, +.gradio-container .tabs[id="sampling-tabs"] .tabitem { + overflow-x: hidden; + white-space: normal; + display: block; + border-radius: 8px; +} + +/* Make the tab navigation fixed at the top for scrollable tabs */ +.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav { + display: flex; + overflow-x: auto; + white-space: nowrap; + scrollbar-width: thin; + border-radius: 8px; + scrollbar-color: #888 #f1f1f1; + position: sticky; + top: 0; + background: white; + z-index: 100; +} + +/* Dropdown menu for scrollable tabs styling */ +.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown { + position: relative; + display: inline-block; +} + +.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content { + display: none; + position: absolute; + background-color: #f9f9f9; + min-width: 160px; + box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2); + z-index: 1; + max-height: 300px; + overflow-y: auto; +} + +.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown:hover .tab-dropdown-content, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown:hover .tab-dropdown-content { + display: block; +} + +/* Scrollbar styling for scrollable tabs */ +.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar { + height: 8px; + border-radius: 8px; +} + +.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar-track, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar-track { + background: #f1f1f1; + border-radius: 8px; +} + +.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar-thumb, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar-thumb { + background: #888; + border-radius: 8px; +} + +/* Tab button styling for scrollable tabs */ +.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-item, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-item { + flex: 0 0 auto; + border-radius: 8px; +} + +/* Plot container styling specifically for sampling tabs */ +.gradio-container .tabs[id="sampling-tabs"] .plot-container { + min-height: 600px; + max-height: 1800px; + overflow-y: auto; +} + +/* Ensure text wraps in HTML components */ +.gradio-container .prose { + white-space: normal; + word-wrap: break-word; + overflow-wrap: break-word; +} + +/* Dropdown button styling for scrollable tabs */ +.gradio-container 
.tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown button, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown button { + background-color: #f0f0f0; + border: 1px solid #ddd; + border-radius: 4px; + padding: 5px 10px; + cursor: pointer; + margin: 2px; +} + +.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown button:hover, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown button:hover { + background-color: #e0e0e0; +} + +/* Style dropdown content items for scrollable tabs */ +.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content div, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content div { + padding: 8px 12px; + cursor: pointer; +} + +.gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content div:hover, +.gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content div:hover { + background-color: #e0e0e0; +} + +/* Custom styling for execution time display */ +.execution-time { + text-align: right; + padding: 8px 16px; + font-family: inherit; + color: #555; + font-size: 0.9rem; + font-style: italic; + margin-left: auto; + width: 100%; + border-top: 1px solid #eee; + margin-top: 8px; +} + +/* Layout for section headers with execution time */ +.section-header { + display: flex; + justify-content: space-between; + align-items: center; + width: 100%; + margin-bottom: 12px; +} + +.section-header h3 { + margin: 0; +} +""" + gr.Markdown("# **AIISC Watermarking Model**") + + with gr.Column(): + gr.Markdown("## Input Prompt") + user_input = gr.Textbox( + label="Enter Your Prompt", + placeholder="Type your text here..." + ) + + with gr.Row(): + with gr.Column(scale=3): + gr.Markdown("## Step 1: Paraphrasing, LCS and Entailment Analysis") + with gr.Column(scale=1): + step1_time = gr.HTML() + + paraphrase_button = gr.Button("Generate Paraphrases") + highlighted_user_prompt = gr.HTML(label="Highlighted User Prompt") + + with gr.Tabs(): + with gr.TabItem("Accepted Paraphrased Sentences"): + highlighted_accepted_sentences = gr.HTML() + with gr.TabItem("Discarded Paraphrased Sentences"): + highlighted_discarded_sentences = gr.HTML() + + with gr.Row(): + with gr.Column(scale=3): + gr.Markdown("## Step 2: Where to Mask?") + with gr.Column(scale=1): + step2_time = gr.HTML() + + masking_button = gr.Button("Apply Masking") + gr.Markdown("### Masked Sentence Trees") + tree1_plots = [] + with gr.Tabs() as tree1_tabs: + for i in range(10): + with gr.TabItem(f"Masked Sentence {i+1}"): + tree1 = gr.Plot() + tree1_plots.append(tree1) + + with gr.Row(): + with gr.Column(scale=3): + gr.Markdown("## Step 3: How to Mask?") + with gr.Column(scale=1): + step3_time = gr.HTML() + + sampling_button = gr.Button("Sample Words") + gr.Markdown("### Sampled Sentence Trees") + + tree2_plots = [] + # Add elem_id to make this tab container scrollable + with gr.Tabs(elem_id="sampling-tabs") as tree2_tabs: + for i in range(10): + with gr.TabItem(f"Sampled Sentence {i+1}"): + # Add a custom class to the container to enable proper styling + with gr.Column(elem_classes=["plot-container"]): + tree2 = gr.Plot() + tree2_plots.append(tree2) + + with gr.Row(): + with gr.Column(scale=3): + gr.Markdown("## Step 4: Re-paraphrasing") + with gr.Column(scale=1): + step4_time = gr.HTML() + + reparaphrase_button = gr.Button("Re-paraphrase") + gr.Markdown("### Reparaphrased Sentences") + reparaphrased_sentences_tabs = [] + with gr.Tabs(elem_id="reparaphrased-tabs") as reparaphrased_tabs: + for i in range(120): + 
with gr.TabItem(f"Reparaphrased Batch {i+1}"): + reparaphrased_sent_html = gr.HTML() + reparaphrased_sentences_tabs.append(reparaphrased_sent_html) + + # Connect the interface functions to the buttons + paraphrase_button.click( + interface.handle_paraphrase, + inputs=user_input, + outputs=[ + highlighted_user_prompt, + highlighted_accepted_sentences, + highlighted_discarded_sentences, + step1_time + ] + ) + + masking_button.click( + interface.handle_masking, + inputs=None, + outputs=tree1_plots + [step2_time] + ) + + sampling_button.click( + interface.handle_sampling, + inputs=None, + outputs=tree2_plots + [step3_time] + ) + + reparaphrase_button.click( + interface.handle_reparaphrasing, + inputs=None, + outputs=reparaphrased_sentences_tabs + [step4_time] + ) + + return demo + +if __name__ == "__main__": + project_root = Path(__file__).parent.parent + config_path = project_root / "utils" / "config.yaml" + config = load_config(config_path)['PECCAVI_TEXT'] + + create_gradio_interface(config).launch() \ No newline at end of file diff --git a/__pycache__/app.cpython-310.pyc b/__pycache__/app.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94f422246f604689537b70b161c46fc265e03b32 Binary files /dev/null and b/__pycache__/app.cpython-310.pyc differ diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..f008ea4cd351a03df9745e357487f55888faaaeb --- /dev/null +++ b/app.py @@ -0,0 +1,21 @@ + +import gradio as gr +from UI.gradio import create_gradio_interface + +from pathlib import Path +from utils.config import load_config + +project_root = Path(__file__).resolve().parent +config_path = project_root / "utils" / "config.yaml" +config = load_config(config_path)['PECCAVI_TEXT'] + +def main(): + """ + This function is the entry point for the PECCAVI Watermarking Model. + + It creates the Gradio interface for the model and runs it. 
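+    The interface itself is built in UI/gradio.py via create_gradio_interface(config)
+    and is served with Gradio's launch(), so running "python app.py" starts a local demo.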
+ """ + create_gradio_interface(config).launch() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..7c6c7b53a6b89d6d2e7eb4487fea47cb0d4b1ed6 --- /dev/null +++ b/environment.yml @@ -0,0 +1,245 @@ +name: panda +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - asttokens=2.4.1=pyhd8ed1ab_0 + - bzip2=1.0.8=h5eee18b_6 + - ca-certificates=2024.8.30=hbcca054_0 + - comm=0.2.2=pyhd8ed1ab_0 + - debugpy=1.8.6=py310hf71b8c6_0 + - decorator=5.1.1=pyhd8ed1ab_0 + - exceptiongroup=1.2.2=pyhd8ed1ab_0 + - executing=2.1.0=pyhd8ed1ab_0 + - ipykernel=6.29.5=pyh3099207_0 + - ipython=8.27.0=pyh707e725_0 + - jedi=0.19.1=pyhd8ed1ab_0 + - jupyter_client=8.6.3=pyhd8ed1ab_0 + - jupyter_core=5.7.2=pyh31011fe_1 + - krb5=1.21.3=h143b758_0 + - ld_impl_linux-64=2.40=h12ee557_0 + - libedit=3.1.20230828=h5eee18b_0 + - libffi=3.4.4=h6a678d5_1 + - libgcc=14.1.0=h77fa898_1 + - libgcc-ng=14.1.0=h69a702a_1 + - libgomp=14.1.0=h77fa898_1 + - libsodium=1.0.20=h4ab18f5_0 + - libstdcxx=14.1.0=hc0a3c3a_1 + - libstdcxx-ng=11.2.0=h1234567_1 + - libuuid=1.41.5=h5eee18b_0 + - matplotlib-inline=0.1.7=pyhd8ed1ab_0 + - ncurses=6.4=h6a678d5_0 + - nest-asyncio=1.6.0=pyhd8ed1ab_0 + - openssl=3.3.2=hb9d3cd8_0 + - packaging=24.1=pyhd8ed1ab_0 + - parso=0.8.4=pyhd8ed1ab_0 + - pexpect=4.9.0=pyhd8ed1ab_0 + - pickleshare=0.7.5=py_1003 + - pip=24.2=py310h06a4308_0 + - platformdirs=4.3.6=pyhd8ed1ab_0 + - prompt-toolkit=3.0.48=pyha770c72_0 + - ptyprocess=0.7.0=pyhd3deb0d_0 + - pure_eval=0.2.3=pyhd8ed1ab_0 + - pygments=2.18.0=pyhd8ed1ab_0 + - python=3.10.14=h955ad1f_1 + - python_abi=3.10=2_cp310 + - pyzmq=26.2.0=py310h71f11fc_2 + - readline=8.2=h5eee18b_0 + - setuptools=75.1.0=py310h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - stack_data=0.6.2=pyhd8ed1ab_0 + - tk=8.6.14=h39e8969_0 + - tornado=6.4.1=py310ha75aee5_1 + - traitlets=5.14.3=pyhd8ed1ab_0 + - typing_extensions=4.12.2=pyha770c72_0 + - wcwidth=0.2.13=pyhd8ed1ab_0 + - wheel=0.44.0=py310h06a4308_0 + - xz=5.4.6=h5eee18b_1 + - zeromq=4.3.5=ha4adb4c_5 + - zlib=1.2.13=h5eee18b_1 + - pip: + - absl-py==2.1.0 + - accelerate==0.33.0 + - aiofiles==23.2.1 + - aiohappyeyeballs==2.3.5 + - aiohttp==3.10.3 + - aiosignal==1.3.1 + - altgraph==0.17.4 + - annotated-types==0.7.0 + - anyio==4.6.0 + - astunparse==1.6.3 + - async-timeout==4.0.3 + - attrs==24.2.0 + - av==12.0.0 + - backports-tarfile==1.2.0 + - beautifulsoup4==4.12.3 + - build==1.2.2 + - cachetools==5.5.0 + - certifi==2024.7.4 + - cffi==1.17.1 + - charset-normalizer==3.3.2 + - clean-fid==0.1.35 + - click==8.1.7 + - colorama==0.4.6 + - contextlib2==21.6.0 + - contourpy==1.2.1 + - cryptography==43.0.1 + - cycler==0.12.1 + - datasets==2.21.0 + - diffusers==0.27.2 + - dill==0.3.8 + - docker-pycreds==0.4.0 + - docutils==0.21.2 + - fastapi==0.115.0 + - ffmpy==0.4.0 + - filelock==3.15.4 + - flatbuffers==24.3.25 + - fonttools==4.53.1 + - frozenlist==1.4.1 + - fsspec==2024.6.1 + - gast==0.4.0 + - gdown==5.2.0 + - gitdb==4.0.11 + - gitpython==3.1.43 + - google-auth==2.35.0 + - google-auth-oauthlib==0.4.6 + - google-pasta==0.2.0 + - gradio==4.44.0 + - gradio-client==1.3.0 + - grpcio==1.65.4 + - h11==0.14.0 + - h5py==3.11.0 + - httpcore==1.0.6 + - httpx==0.27.2 + - huggingface-hub==0.25.2 + - idna==3.7 + - imageio==2.35.0 + - importlib-metadata==8.2.0 + - importlib-resources==6.4.5 + - jaraco-classes==3.4.0 + - jaraco-context==6.0.1 + - jaraco-functools==4.1.0 + - 
jeepney==0.8.0 + - jinja2==3.1.4 + - joblib==1.4.2 + - json-with-comments==1.2.7 + - keras==3.5.0 + - keras-preprocessing==1.1.2 + - keyring==25.4.1 + - kiwisolver==1.4.5 + - kornia==0.7.4 + - kornia-rs==0.1.7 + - lazy-loader==0.4 + - libclang==18.1.1 + - markdown==3.6 + - markdown-it-py==3.0.0 + - markupsafe==2.1.5 + - matplotlib==3.9.2 + - mdurl==0.1.2 + - ml-collections==0.1.1 + - ml-dtypes==0.4.0 + - more-itertools==10.5.0 + - multidict==6.0.5 + - multiprocess==0.70.16 + - namex==0.0.8 + - networkx==3.3 + - nh3==0.2.18 + - nltk==3.9.1 + - numpy==1.26.4 + - nvidia-cublas-cu11==11.10.3.66 + - nvidia-cuda-nvrtc-cu11==11.7.99 + - nvidia-cuda-runtime-cu11==11.7.99 + - nvidia-cudnn-cu11==8.5.0.96 + - oauthlib==3.2.2 + - opencv-python==4.10.0.84 + - opencv-python-headless==4.10.0.84 + - opt-einsum==3.3.0 + - optree==0.12.1 + - orjson==3.10.7 + - pandas==2.2.2 + - pillow==10.4.0 + - pkginfo==1.10.0 + - plotly==5.24.1 + - protobuf==4.25.5 + - psutil==5.9.8 + - pyarrow==17.0.0 + - pyasn1==0.6.1 + - pyasn1-modules==0.4.1 + - pycparser==2.22 + - pydantic==2.9.2 + - pydantic-core==2.23.4 + - pydub==0.25.1 + - pyinstaller==6.10.0 + - pyinstaller-hooks-contrib==2024.8 + - pyparsing==3.1.2 + - pyproject-hooks==1.1.0 + - pysocks==1.7.1 + - python-dateutil==2.9.0.post0 + - python-multipart==0.0.12 + - pytorch-msssim==1.0.0 + - pytorchcv==0.0.73 + - pytz==2023.3.post1 + - pyyaml==6.0.2 + - readme-renderer==44.0 + - regex==2024.7.24 + - requests==2.32.3 + - requests-oauthlib==2.0.0 + - requests-toolbelt==1.0.0 + - rfc3986==2.0.0 + - rich==13.7.1 + - rsa==4.9 + - ruff==0.6.9 + - safetensors==0.4.4 + - saliency==0.2.1 + - scikit-image==0.24.0 + - scikit-learn==1.6.0 + - scipy==1.14.0 + - secretstorage==3.3.3 + - semantic-version==2.10.0 + - sentence-transformers==3.3.1 + - sentry-sdk==2.15.0 + - setproctitle==1.3.3 + - shapely==2.0.5 + - shellingham==1.5.4 + - six==1.12.0 + - smmap==5.0.1 + - sniffio==1.3.1 + - soupsieve==2.6 + - spaces==0.30.2 + - starlette==0.38.6 + - tenacity==9.0.0 + - tensorboard==2.17.1 + - tensorboard-data-server==0.7.2 + - tensorboard-plugin-wit==1.8.1 + - tensorflow==2.17.0 + - tensorflow-estimator==2.10.0 + - tensorflow-hub==0.16.1 + - tensorflow-intel==0.0.1 + - tensorflow-io-gcs-filesystem==0.31.0 + - termcolor==1.1.0 + - tf-keras==2.17.0 + - threadpoolctl==3.5.0 + - tifffile==2024.8.10 + - timm==1.0.10 + - tokenizers==0.19.1 + - tomli==2.0.1 + - tomlkit==0.12.0 + - torch==1.13.1 + - torchvision==0.14.1 + - tqdm==4.66.5 + - transformers==4.43.3 + - twine==5.1.1 + - typer==0.12.5 + - tzdata==2024.1 + - urllib3==2.2.2 + - uvicorn==0.31.0 + - wandb==0.18.3 + - websockets==12.0 + - werkzeug==3.0.4 + - wrapt==1.11.2 + - xxhash==3.4.1 + - yarl==1.9.4 + - zipp==3.20.0 +prefix: /home/ashhar21137/miniconda3/envs/panda diff --git a/metrics/distortion.py b/metrics/distortion.py new file mode 100644 index 0000000000000000000000000000000000000000..823724c5e0aaa928d655b4861374c87d650b576c --- /dev/null +++ b/metrics/distortion.py @@ -0,0 +1,370 @@ +import os +import sys +from tqdm import tqdm +import numpy as np +import torch +import matplotlib.pyplot as plt +from transformers import GPT2LMHeadModel, GPT2TokenizerFast +from bert_score import BERTScorer +from bert_score.utils import model2layers +from nltk.tokenize import word_tokenize +from Levenshtein import distance as levenshtein_distance +from sentence_transformers import SentenceTransformer +from sklearn.feature_extraction.text import TfidfVectorizer +from scipy.spatial.distance import cdist +from scipy.optimize import 
linear_sum_assignment +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from config.config import load_config +config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.yaml') +config = load_config(config_path)['PECCAVI_TEXT']['Metrics'] + +class SentenceDistortionCalculator: + """ + A class to calculate and analyze distortion metrics between an original sentence and modified sentences. + """ + def __init__(self, config, original_sentence, paraphrased_sentences): + """ + Initialize the calculator with the original sentence and a list of modified sentences. + """ + self.original_sentence = original_sentence + self.paraphrased_sentences = paraphrased_sentences + + self.levenshtein_distances = {} + self.bert_scores = {} + self.mover_scores = {} + + self.normalized_levenshtein = {} + self.normalized_bert_scores = {} + self.normalized_mover_scores = {} + self.combined_distortions = {} + + self.tokenizer = GPT2TokenizerFast.from_pretrained(config['Distortion']) + self.model = GPT2LMHeadModel.from_pretrained(config['Distortion']) + self.model.eval() + + def calculate_all_metrics(self): + """ + Calculate all distortion metrics for each modified sentence. + """ + for idx, modified_sentence in tqdm(enumerate(self.paraphrased_sentences), total=len(self.paraphrased_sentences), desc="Calculating Metrics"): + key = f"Sentence_{idx+1}" + self.levenshtein_distances[key] = self._calculate_levenshtein_distance(modified_sentence) + self.bert_scores[key] = self._calculate_bert_score(modified_sentence) + self.mover_scores[key] = self._calculate_mover_score(modified_sentence) + + + def normalize_metrics(self): + """ + Normalize all metrics to be between 0 and 1. + """ + for _ in tqdm(range(1), desc="Normalizing Metrics"): # Add tqdm here (wrap the normalization process) + self.normalized_levenshtein = self._normalize_dict(self.levenshtein_distances) + self.normalized_bert_scores = self._normalize_dict(self.bert_scores) + self.normalized_mover_scores = self._normalize_dict(self.mover_scores) + + def calculate_combined_distortion(self): + """ + Calculate the combined distortion using the root mean square of the normalized metrics. + """ + for _ in tqdm(range(1), desc="Calculating Combined Distortion"): # Add tqdm here + for key in self.normalized_levenshtein.keys(): + rms = np.sqrt( + ( + self.normalized_levenshtein[key] ** 2 + + self.normalized_bert_scores[key] ** 2+ + self.normalized_mover_scores[key] **2 + ) / 3 + ) + self.combined_distortions[key] = rms + + def plot_metrics(self): + """ + Plot each normalized metric and the combined distortion in separate graphs. 
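+        Each figure plots the sentence index on the x-axis against the normalized
+        value (0-1) on the y-axis and is displayed immediately with plt.show().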
+ """ + keys = list(self.normalized_levenshtein.keys()) + indices = np.arange(len(keys)) + + # Prepare data for plotting + metrics = { + 'Levenshtein Distance': [self.normalized_levenshtein[key] for key in keys], + 'BERTScore': [self.normalized_bert_scores[key] for key in keys], + 'MOVERscore':[self.normalized_mover_scores[key] for key in keys], + 'Combined Distortion': [self.combined_distortions[key] for key in keys] + } + + # Plot each metric separately + for metric_name, values in tqdm(metrics.items(), desc="Plotting Metrics"): # Add tqdm here + plt.figure(figsize=(12, 6)) + plt.plot(indices, values, marker='o', color=np.random.rand(3,)) + plt.xlabel('Sentence Index') + plt.ylabel('Normalized Value (0-1)') + plt.title(f'Normalized {metric_name}') + plt.grid(True) + plt.tight_layout() + plt.show() + + def _calculate_levenshtein_distance(self, modified_sentence): + """ + Calculate the word-level Levenshtein distance between the original and modified sentence. + """ + words1 = word_tokenize(self.original_sentence) + words2 = word_tokenize(modified_sentence) + lev_distance = levenshtein_distance(words1, words2) + return (lev_distance / max(len(words1), len(words2))) + + def _calculate_bert_score(self, modified_sentence): + """ + Compute the BERTScore similarity between the original and modified sentence. + Returns 1 - F1 score to represent dissimilarity. + """ + if not hasattr(self, 'original_sentence'): + raise ValueError("original_sentence is not set. Please set self.original_sentence before calling this function.") + if not isinstance(modified_sentence, str): + raise ValueError("modified_sentence must be a string.") + + model_type = "microsoft/deberta-xlarge-mnli" + num_layers = model2layers[model_type] + + if not hasattr(self, "cached_bertscorer"): + self.cached_bertscorer = BERTScorer( + model_type=model_type, + num_layers=num_layers, + batch_size=1, # Single sentence comparison + nthreads=4, + all_layers=False, + idf=False, + device="cuda" if torch.cuda.is_available() else "cpu", + lang="en" + ) + + # Compute BERTScore + _, _, F1 = self.cached_bertscorer.score( + cands=[modified_sentence], + refs=[self.original_sentence], + verbose=False, + batch_size=1 + ) + + return 1 - F1.item() # Return dissimilarity score + def _calculate_mover_score(self,modified_sentence,model_name='all-MiniLM-L6-v2'): + """Compute MoverScore correctly using word-level embeddings.""" + if not self.original_sentence: + raise ValueError("Original sentence not provided.") + + # Tokenize sentences + original_tokens = self.original_sentence.split() + modified_tokens = modified_sentence.split() + model = SentenceTransformer(model_name) + + # Compute word embeddings + original_embeddings = model.encode(original_tokens, convert_to_numpy=True) + modified_embeddings = model.encode(modified_tokens, convert_to_numpy=True) + + # Compute cost matrix (cosine distance) + cost_matrix = cdist(original_embeddings, modified_embeddings, metric='cosine') + + # Solve optimal transport problem (Hungarian Algorithm) + row_ind, col_ind = linear_sum_assignment(cost_matrix) + + # Compute IDF weights + vectorizer = TfidfVectorizer() + vectorizer.fit([self.original_sentence, modified_sentence]) + idf_values = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_)) + + # Apply IDF weighting to aligned word pairs + idf_weights_original = np.array([idf_values.get(word.lower(), 1.0) for word in original_tokens]) + idf_weights_modified = np.array([idf_values.get(word.lower(), 1.0) for word in modified_tokens]) + combined_idf_weights = 
(idf_weights_original[row_ind] + idf_weights_modified[col_ind]) / 2 + weighted_score = np.sum((1 - cost_matrix[row_ind, col_ind]) * combined_idf_weights) / np.sum(combined_idf_weights) + + return 1-weighted_score # Higher score = more dissimilar + + def _normalize_dict(self, metric_dict): + """ + Normalize the values in a dictionary to be between 0 and 1. + """ + values = np.array(list(metric_dict.values())) + min_val = values.min() + max_val = values.max() + if max_val - min_val == 0: + normalized_values = np.zeros_like(values) + else: + normalized_values = (values - min_val) / (max_val - min_val) + return dict(zip(metric_dict.keys(), normalized_values)) + + def get_normalized_metrics(self): + """ + Get all normalized metrics as a dictionary. + """ + return { + 'Min Edit Distance': self.normalized_levenshtein, + 'BERTScore': self.normalized_bert_scores, + 'Mover Score': self.normalized_mover_scores + } + + def get_combined_distortions(self): + """ + Get the dictionary of combined distortion values. + """ + return self.combined_distortions + +# Example usage +if __name__ == "__main__": + + config = load_config(config_path)['PECCAVI_TEXT']['Metrics'] + + # Original sentence + original_sentence = "The quick brown fox jumps over the lazy dog" + + # Paraphrased sentences + paraphrased_sentences = [ + # Original 1: "A swift auburn fox leaps across a sleepy canine." + "The swift auburn fox leaps across a sleepy canine.", + "A quick auburn fox leaps across a sleepy canine.", + "A swift ginger fox leaps across a sleepy canine.", + "A swift auburn fox bounds across a sleepy canine.", + "A swift auburn fox leaps across a tired canine.", + "Three swift auburn foxes leap across a sleepy canine.", + "The vulpine specimen rapidly traverses over a dormant dog.", + "Like lightning, the russet hunter soars over the drowsy guardian.", + "Tha quick ginger fox jumps o'er the lazy hound, ye ken.", + "One rapid Vulpes vulpes traverses the path of a quiescent canine.", + "A swift auburn predator navigates across a lethargic pet.", + "Subject A (fox) demonstrates velocity over Subject B (dog).", + + # Original 2: "The agile russet fox bounds over an idle hound." + "Some agile russet foxes bound over an idle hound.", + "The nimble russet fox bounds over an idle hound.", + "The agile brown fox bounds over an idle hound.", + "The agile russet fox jumps over an idle hound.", + "The agile russet fox bounds over a lazy hound.", + "Two agile russet foxes bound over an idle hound.", + "A dexterous vulpine surpasses a stationary canine.", + "Quick as thought, the copper warrior sails over the guardian.", + "Tha nimble reddish fox jumps o'er the doggo, don't ya know.", + "A dexterous V. vulpes exceeds the plane of an inactive canine.", + "An agile russet hunter maneuvers above a resting hound.", + "Test subject F-1 achieves displacement superior to subject D-1.", + + # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog." + "The nimble mahogany vulpine vaults above a drowsy dog.", + "A swift mahogany vulpine vaults above a drowsy dog.", + "A nimble reddish vulpine vaults above a drowsy dog.", + "A nimble mahogany fox vaults above a drowsy dog.", + "A nimble mahogany vulpine leaps above a drowsy dog.", + "Four nimble mahogany vulpines vault above a drowsy dog.", + "An agile specimen of reddish fur surpasses a somnolent canine.", + "Fleet as wind, the earth-toned hunter soars over the sleepy guard.", + "Tha quick brown beastie jumps o'er the tired pup, aye.", + "Single V. 
vulpes demonstrates vertical traverse over C. familiaris.", + "A nimble rust-colored predator crosses above a drowsy pet.", + "Observed: Subject Red executes vertical motion over Subject Gray.", + + # Original 4: "The speedy copper-colored fox hops over the lethargic pup." + "A speedy copper-colored fox hops over the lethargic pup.", + "The quick copper-colored fox hops over the lethargic pup.", + "The speedy bronze fox hops over the lethargic pup.", + "The speedy copper-colored fox jumps over the lethargic pup.", + "The speedy copper-colored fox hops over the tired pup.", + "Multiple speedy copper-colored foxes hop over the lethargic pup.", + "A rapid vulpine of bronze hue traverses an inactive young canine.", + "Swift as a dart, the metallic hunter bounds over the lazy puppy.", + "Tha fast copper beastie leaps o'er the sleepy wee dog.", + "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.", + "A fleet copper-toned predator moves past a sluggish young dog.", + "Field note: Adult fox subject exceeds puppy subject vertically.", + + # Original 5: "A rapid tawny fox springs over a sluggish dog." + "The rapid tawny fox springs over a sluggish dog.", + "A quick tawny fox springs over a sluggish dog.", + "A rapid golden fox springs over a sluggish dog.", + "A rapid tawny fox jumps over a sluggish dog.", + "A rapid tawny fox springs over a lazy dog.", + "Six rapid tawny foxes spring over a sluggish dog.", + "An expeditious yellowish vulpine surpasses a torpid canine.", + "Fast as a bullet, the golden hunter vaults over the idle guard.", + "Tha swift yellowy fox jumps o'er the lazy mutt, aye.", + "One V. vulpes displays rapid transit over one inactive C. familiaris.", + "A speedy yellow-brown predator bypasses a motionless dog.", + "Log entry: Vulpine subject achieves swift vertical displacement.", + + # Original 6: "The fleet-footed chestnut fox soars above an indolent canine." + "A fleet-footed chestnut fox soars above an indolent canine.", + "The swift chestnut fox soars above an indolent canine.", + "The fleet-footed brown fox soars above an indolent canine.", + "The fleet-footed chestnut fox leaps above an indolent canine.", + "The fleet-footed chestnut fox soars above a lazy canine.", + "Several fleet-footed chestnut foxes soar above an indolent canine.", + "A rapid brown vulpine specimen traverses a lethargic domestic dog.", + "Graceful as a bird, the nutbrown hunter flies over the lazy guard.", + "Tha quick brown beastie sails o'er the sleepy hound, ken.", + "Single agile V. vulpes achieves elevation above stationary canine.", + "A nimble brown predator glides over an unmoving domestic animal.", + "Research note: Brown subject displays superior vertical mobility.", + + # Original 7: "A fast ginger fox hurdles past a slothful dog." + "The fast ginger fox hurdles past a slothful dog.", + "A quick ginger fox hurdles past a slothful dog.", + "A fast red fox hurdles past a slothful dog.", + "A fast ginger fox jumps past a slothful dog.", + "A fast ginger fox hurdles past a lazy dog.", + "Five fast ginger foxes hurdle past a slothful dog.", + "A rapid orange vulpine bypasses a lethargic canine.", + "Quick as lightning, the flame-colored hunter races past the lazy guard.", + "Tha swift ginger beastie leaps past the tired doggy, ye see.", + "1 rapid orange V. vulpes surpasses 1 inactive C. 
familiaris.", + "A speedy red-orange predator overtakes a motionless dog.", + "Data point: Orange subject demonstrates rapid transit past Gray subject.", + + # Original 8: "The spry rusty-colored fox jumps across a dozing hound." + "A spry rusty-colored fox jumps across a dozing hound.", + "The agile rusty-colored fox jumps across a dozing hound.", + "The spry reddish fox jumps across a dozing hound.", + "The spry rusty-colored fox leaps across a dozing hound.", + "The spry rusty-colored fox jumps across a sleeping hound.", + "Multiple spry rusty-colored foxes jump across a dozing hound.", + "An agile rust-toned vulpine traverses a somnolent canine.", + "Nimble as thought, the copper hunter bounds over the resting guard.", + "Tha lively rust-colored beastie hops o'er the snoozin' hound.", + "Single dexterous V. vulpes crosses path of dormant C. familiaris.", + "A lithe rust-tinted predator moves past a slumbering dog.", + "Observation: Russet subject exhibits agility over dormant subject.", + + # Original 9: "A quick tan fox leaps over an inactive dog." + "The quick tan fox leaps over an inactive dog.", + "A swift tan fox leaps over an inactive dog.", + "A quick beige fox leaps over an inactive dog.", + "A quick tan fox jumps over an inactive dog.", + "A quick tan fox leaps over a motionless dog.", + "Seven quick tan foxes leap over an inactive dog.", + "A rapid light-brown vulpine surpasses a stationary canine.", + "Fast as wind, the sand-colored hunter soars over the still guard.", + "Tha nimble tan beastie jumps o'er the quiet doggy, aye.", + "One agile fawn V. vulpes traverses one immobile C. familiaris.", + "A fleet tan-colored predator bypasses an unmoving dog.", + "Field report: Tan subject demonstrates movement over static subject.", + + # Original 10: "The brisk auburn vulpine bounces over a listless canine." + "Some brisk auburn vulpines bounce over a listless canine.", + "The quick auburn vulpine bounces over a listless canine.", + "The brisk russet vulpine bounces over a listless canine.", + "The brisk auburn fox bounces over a listless canine.", + "The brisk auburn vulpine jumps over a listless canine.", + "Five brisk auburn vulpines bounce over a listless canine.", + "The expeditious specimen supersedes a quiescent Canis lupus.", + "Swift as wind, the russet hunter vaults over the idle guardian.", + "Tha quick ginger beastie hops o'er the lazy mutt, aye.", + "One V. vulpes achieves displacement over inactive C. familiaris.", + "A high-velocity auburn predator traverses an immobile animal.", + "Final observation: Red subject shows mobility over Gray subject." 
+ ] + + distortion_calculator = SentenceDistortionCalculator(config, original_sentence, paraphrased_sentences) + for _ in tqdm(range(1)): + distortion_calculator.calculate_all_metrics() + distortion_calculator.normalize_metrics() + distortion_calculator.calculate_combined_distortion() + distortion_calculator.plot_metrics() + print("Normalized Metrics:", distortion_calculator.get_normalized_metrics()) + print("Combined Distortion:", distortion_calculator.get_combined_distortions()) \ No newline at end of file diff --git a/renderers/__pycache__/highlighter.cpython-310.pyc b/renderers/__pycache__/highlighter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..510ec095660081fa4720670c142da6842e77f377 Binary files /dev/null and b/renderers/__pycache__/highlighter.cpython-310.pyc differ diff --git a/renderers/__pycache__/highlighter.cpython-311.pyc b/renderers/__pycache__/highlighter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ab39a71b59353506700760768ad5c3f09fe3f8b Binary files /dev/null and b/renderers/__pycache__/highlighter.cpython-311.pyc differ diff --git a/renderers/__pycache__/plot_3d.cpython-310.pyc b/renderers/__pycache__/plot_3d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e793dc94a8945951cce597a5439ad5512dec6bb9 Binary files /dev/null and b/renderers/__pycache__/plot_3d.cpython-310.pyc differ diff --git a/renderers/__pycache__/plot_3d.cpython-311.pyc b/renderers/__pycache__/plot_3d.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dae5f8085f8aefeeaf78f7d81ce31f8f607ed9d Binary files /dev/null and b/renderers/__pycache__/plot_3d.cpython-311.pyc differ diff --git a/renderers/__pycache__/tree.cpython-310.pyc b/renderers/__pycache__/tree.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..233737f1a6b7ee72033417aafb41980c35f7cbde Binary files /dev/null and b/renderers/__pycache__/tree.cpython-310.pyc differ diff --git a/renderers/__pycache__/tree.cpython-311.pyc b/renderers/__pycache__/tree.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6fc03ee206c2a9ff0fc0efdc5d64287e521a2e3a Binary files /dev/null and b/renderers/__pycache__/tree.cpython-311.pyc differ diff --git a/renderers/highlighter.py b/renderers/highlighter.py new file mode 100644 index 0000000000000000000000000000000000000000..6f09fe17ea7cd4d5f0261e4dc416bd9f7b66ae8b --- /dev/null +++ b/renderers/highlighter.py @@ -0,0 +1,162 @@ +import re + +def highlight_common_words(common_words, sentences, title): + """ + Highlight common words in sentences by adding color-coded background and unique IDs. + + Args: + common_words (list of tuples): List of tuples where each tuple contains a word's index and the word. + sentences (list of str): List of sentences to search through. + title (str): The title for the HTML output. + + Returns: + str: HTML string with the highlighted sentences. + """ + color_map = {} + color_index = 0 + highlighted_html = [] + + # Process each sentence + for idx, sentence in enumerate(sentences, start=1): + sentence_with_idx = f"{idx}. 
{sentence}" + highlighted_sentence = sentence_with_idx + + # Highlight common words in each sentence + for index, word in common_words: + if word not in color_map: + color_map[word] = f'hsl({color_index * 60 % 360}, 70%, 80%)' + color_index += 1 + + # Escape word and create regex pattern to match whole word + escaped_word = re.escape(word) + pattern = rf'\b{escaped_word}\b' + + # Replace the word with highlighted version + highlighted_sentence = re.sub( + pattern, + lambda m, idx=index, color=color_map[word]: ( + f'' + f'{idx}' + f'{m.group(0)}' + f'' + ), + highlighted_sentence, + flags=re.IGNORECASE + ) + + highlighted_html.append(highlighted_sentence) + + # Format the HTML output with the title + final_html = "

".join(highlighted_html) + return f''' +
+    <div>
+        <h3>{title}</h3>
+        <div>{final_html}</div>
+    </div>
+ ''' + +def highlight_common_words_dict(common_words, sentences, title): + """ + Highlight common words in sentences (from a dictionary) by adding color-coded background and unique IDs. + + Args: + common_words (list of tuples): List of tuples where each tuple contains a word's index and the word. + sentences (dict): A dictionary of sentences where the key is the sentence and the value is an entailment score. + title (str): The title for the HTML output. + + Returns: + str: HTML string with the highlighted sentences and their entailment scores. + """ + color_map = {} + color_index = 0 + highlighted_html = [] + + # Process each sentence and its score + for idx, (sentence, score) in enumerate(sentences.items(), start=1): + sentence_with_idx = f"{idx}. {sentence}" + highlighted_sentence = sentence_with_idx + + # Highlight common words in each sentence + for index, word in common_words: + if word not in color_map: + color_map[word] = f'hsl({color_index * 60 % 360}, 70%, 80%)' + color_index += 1 + + # Escape word and create regex pattern to match whole word + escaped_word = re.escape(word) + pattern = rf'\b{escaped_word}\b' + + # Replace the word with highlighted version + highlighted_sentence = re.sub( + pattern, + lambda m, idx=index, color=color_map[word]: ( + f'' + f'{idx}' + f'{m.group(0)}' + f'' + ), + highlighted_sentence, + flags=re.IGNORECASE + ) + + # Add the entailment score + highlighted_html.append( + f'
<div>'
+            f'{highlighted_sentence}'
+            f'<div>'
+            f'Entailment Score: {score}</div></div>'
+        )
+
+    # Format the HTML output with the title
+    final_html = "<br>".join(highlighted_html)
+    return f'''
+    <div>
+        <h3>{title}</h3>
+        <div>{final_html}</div>
+    </div>
+ ''' + +def reparaphrased_sentences_html(sentences): + """ + Create an HTML representation of sentences with numbering. + + Args: + sentences (list of str): List of sentences to format. + + Returns: + str: HTML string with numbered sentences. + """ + formatted_sentences = [] + + # Process each sentence + for idx, sentence in enumerate(sentences, start=1): + sentence_with_idx = f"{idx}. {sentence}" + formatted_sentences.append(sentence_with_idx) + + # Format the HTML output + final_html = "

".join(formatted_sentences) + return f''' +
+    <div>
+        {final_html}
+    </div>
+ ''' + +if __name__ == "__main__": + # Example usage + common_words = [(1, "highlight"), (2, "numbering")] + sentences = ["This is a test to highlight words.", "Numbering is important for clarity."] + + # Test highlight_common_words + highlighted_html = highlight_common_words(common_words, sentences, "Test Highlighting") + print(highlighted_html) + + # Test highlight_common_words_dict + sentences_with_scores = {"Highlight words in this text.": 0.95, "Number sentences for clarity.": 0.8} + highlighted_html_dict = highlight_common_words_dict(common_words, sentences_with_scores, "Test Dict Highlighting") + print(highlighted_html_dict) diff --git a/renderers/plot_3d.py b/renderers/plot_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..3355c77b307fd95129c37ff5e5ae5d3de15751c1 --- /dev/null +++ b/renderers/plot_3d.py @@ -0,0 +1,126 @@ +""" +This file contains the code to plot a 3d tree +""" +import numpy as np +import plotly.graph_objects as go +from scipy.interpolate import griddata + +def gen_three_D_plot(detectability_val, distortion_val, euclidean_val): + """ + Generates a 3D surface plot showing the relationship between detectability, distortion, + and Euclidean distance, with a focus on highlighting the "sweet spot" based on a composite score. + + The function takes three sets of values: detectability, distortion, and Euclidean distance, + normalizes them to a [0, 1] range, and computes a composite score that combines these three metrics. + The "sweet spot" is the point where the composite score is maximized. This sweet spot is plotted + as a red marker on the 3D surface plot. + + The function then uses a grid interpolation method (`griddata`) to generate a smooth surface + for the Euclidean distance over the detectability and distortion values. The result is a surface plot + where the contours represent different Euclidean distances. + + Args: + detectability_val (list or array): A list or array of detectability scores. + distortion_val (list or array): A list or array of distortion scores. + euclidean_val (list or array): A list or array of Euclidean distances. + + Returns: + plotly.graph_objects.Figure: A Plotly figure object representing the 3D surface plot, + with contour lines and a marker for the sweet spot. + + Raises: + ValueError: If `griddata` fails to generate a valid interpolation, which could happen if the + input data does not allow for a proper interpolation. + + Example: + # Example of usage: + detectability_vals = [0.1, 0.3, 0.5, 0.7, 0.9] + distortion_vals = [0.2, 0.4, 0.6, 0.8, 1.0] + euclidean_vals = [0.5, 0.3, 0.2, 0.4, 0.6] + + fig = gen_three_D_plot(detectability_vals, distortion_vals, euclidean_vals) + fig.show() # Displays the plot in a web browser + + Notes: + - The composite score is calculated as: + `composite_score = norm_detectability - (norm_distortion + norm_euclidean)`, + where the goal is to maximize detectability and minimize distortion and Euclidean distance. + - The `griddata` function uses linear interpolation to create a smooth surface for the plot. + - The function uses the "Plasma" colorscale for the surface plot, which provides a perceptually uniform color scheme. 
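+        - For the example inputs above, the normalized composite scores work out to
+          [-0.75, -0.25, 0.0, -0.5, -1.0], so the sweet spot is the third point
+          (detectability 0.5, distortion 0.6, Euclidean distance 0.2).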
+ """ + + detectability = np.array(detectability_val) + distortion = np.array(distortion_val) + euclidean = np.array(euclidean_val) + + # Normalize the values to range [0, 1] + norm_detectability = (detectability - min(detectability)) / (max(detectability) - min(detectability)) + norm_distortion = (distortion - min(distortion)) / (max(distortion) - min(distortion)) + norm_euclidean = (euclidean - min(euclidean)) / (max(euclidean) - min(euclidean)) + + # Composite score: maximize detectability, minimize distortion and Euclidean distance + composite_score = norm_detectability - (norm_distortion + norm_euclidean) + + # Find the index of the maximum score (sweet spot) + sweet_spot_index = np.argmax(composite_score) + + # Sweet spot values + sweet_spot_detectability = detectability[sweet_spot_index] + sweet_spot_distortion = distortion[sweet_spot_index] + sweet_spot_euclidean = euclidean[sweet_spot_index] + + # Create a meshgrid from the data + x_grid, y_grid = np.meshgrid(np.linspace(min(detectability), max(detectability), 30), + np.linspace(min(distortion), max(distortion), 30)) + + # Interpolate z values (Euclidean distances) to fit the grid using 'nearest' method + z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='nearest') + + if z_grid is None: + raise ValueError("griddata could not generate a valid interpolation. Check your input data.") + + # Create the 3D contour plot with the Plasma color scale + fig = go.Figure(data=go.Surface( + z=z_grid, + x=x_grid, + y=y_grid, + contours={ + "z": {"show": True, "start": min(euclidean), "end": max(euclidean), "size": 0.1, "usecolormap": True} + }, + colorscale='Plasma' + )) + + # Add a marker for the sweet spot + fig.add_trace(go.Scatter3d( + x=[sweet_spot_detectability], + y=[sweet_spot_distortion], + z=[sweet_spot_euclidean], + mode='markers+text', + marker=dict(size=10, color='red', symbol='circle'), + text=["Sweet Spot"], + textposition="top center" + )) + + # Set axis labels + fig.update_layout( + scene=dict( + xaxis_title='Detectability Score', + yaxis_title='Distortion Score', + zaxis_title='Euclidean Distance' + ), + margin=dict(l=0, r=0, b=0, t=0) + ) + + return fig + +if __name__ == "__main__": + # Example input data + detectability_vals = [0.1, 0.3, 0.5, 0.7, 0.9] + distortion_vals = [0.2, 0.4, 0.6, 0.8, 1.0] + euclidean_vals = [0.5, 0.3, 0.2, 0.4, 0.6] + + # Call the function with example data + fig = gen_three_D_plot(detectability_vals, distortion_vals, euclidean_vals) + + # Show the plot + fig.show() \ No newline at end of file diff --git a/renderers/tree.py b/renderers/tree.py new file mode 100644 index 0000000000000000000000000000000000000000..7ec281b0238bb808ebb84e920c8191022daa194e --- /dev/null +++ b/renderers/tree.py @@ -0,0 +1,490 @@ +import plotly.graph_objects as go +import textwrap +import re +from collections import defaultdict + +def generate_subplot1(paraphrased_sentence, masked_sentences, strategies, highlight_info, common_grams): + """ + Generates a subplot visualizing paraphrased and masked sentences in a tree structure. + Highlights common words with specific colors and applies Longest Common Subsequence (LCS) numbering. + + Args: + paraphrased_sentence (str): The paraphrased sentence to be visualized. + masked_sentences (list of str): A list of masked sentences to be visualized. + strategies (list of str, optional): List of strategies used for each masked sentence. 
+ highlight_info (list of tuples): A list of tuples where each tuple contains a word and its associated color for highlighting. + common_grams (list of tuples): A list of tuples containing an index and a common word or phrase for LCS numbering. + + Returns: + plotly.graph_objects.Figure: A Plotly figure representing the tree structure with highlighted words and labeled edges. + """ + # Combine nodes into one list with appropriate labels + if isinstance(masked_sentences, str): + masked_sentences = [masked_sentences] + nodes = [paraphrased_sentence] + masked_sentences + nodes[0] += ' L0' # Paraphrased sentence is level 0 + if len(nodes) < 2: + print("[ERROR] Insufficient nodes for visualization") + return go.Figure() + + for i in range(1, len(nodes)): + nodes[i] += ' L1' # masked sentences are level 1 + + def apply_lcs_numbering(sentence, common_grams): + """ + Applies LCS numbering to the sentence based on the common_grams. + + Args: + sentence (str): The sentence to which the LCS numbering should be applied. + common_grams (list of tuples): A list of common grams to be replaced with LCS numbers. + + Returns: + str: The sentence with LCS numbering applied. + """ + for idx, lcs in common_grams: + sentence = re.sub(rf"\b{lcs}\b", f"({idx}){lcs}", sentence) + return sentence + + # Apply LCS numbering + nodes = [apply_lcs_numbering(node, common_grams) for node in nodes] + + + def highlight_words(sentence, color_map): + """ + Highlights words in the sentence based on the color_map. + + Args: + sentence (str): The sentence where the words will be highlighted. + color_map (dict): A dictionary mapping words to their colors. + + Returns: + str: The sentence with highlighted words. + """ + for word, color in color_map.items(): + sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE) + return sentence + + # Clean and wrap nodes, and highlight specified words globally + cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes] + global_color_map = dict(highlight_info) + highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes] + wrapped_nodes = ['
'.join(textwrap.wrap(node, width=55)) for node in highlighted_nodes] + + def get_levels_and_edges(nodes, strategies=None): + """ + Determines tree levels and creates edges dynamically. + + Args: + nodes (list of str): The nodes representing the sentences. + strategies (list of str, optional): The strategies used for each edge. + + Returns: + tuple: A tuple containing two dictionaries: + - levels: A dictionary mapping node indices to their levels. + - edges: A list of edges where each edge is represented by a tuple of node indices. + """ + levels = {} + edges = [] + for i, node in enumerate(nodes): + level = int(node.split()[-1][1]) + levels[i] = level + + # Add edges from L0 to all L1 nodes + root_node = next((i for i, level in levels.items() if level == 0), 0) + for i, level in levels.items(): + if level == 1: + edges.append((root_node, i)) + + return levels, edges + + # Get levels and dynamic edges + levels, edges = get_levels_and_edges(nodes, strategies) + max_level = max(levels.values(), default=0) + + # Calculate positions + positions = {} + level_heights = defaultdict(int) + for node, level in levels.items(): + level_heights[level] += 1 + + y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()} + x_gap = 2 + l1_y_gap = 10 + + for node, level in levels.items(): + if level == 1: + positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap) + else: + positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap) + y_offsets[level] += 1 + + def color_highlighted_words(node, color_map): + """ + Colors the highlighted words in the node text. + + Args: + node (str): The node text to be highlighted. + color_map (dict): A dictionary mapping words to their colors. + + Returns: + str: The node text with highlighted words. 
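+            Only substrings wrapped in double curly braces (the markers produced by
+            highlight_words) are recolored; all other text is returned unchanged.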
+ """ + parts = re.split(r'(\{\{.*?\}\})', node) + colored_parts = [] + for part in parts: + match = re.match(r'\{\{(.*?)\}\}', part) + if match: + word = match.group(1) + color = color_map.get(word, 'black') + colored_parts.append(f"{word}") + else: + colored_parts.append(part) + return ''.join(colored_parts) + + # Define the text for each edge + default_edge_texts = [ + "Highest Entropy Masking", "Pseudo-random Masking", "Random Masking", + "Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling", + "Inverse Transform Sampling", "Greedy Sampling", "Temperature Sampling", + "Exponential Minimum Sampling", "Inverse Transform Sampling", "Greedy Sampling", + "Temperature Sampling", "Exponential Minimum Sampling", "Inverse Transform Sampling" + ] + + if len(nodes) < 2: + print("[ERROR] Insufficient nodes for visualization") + return go.Figure() + + # Create figure + fig1 = go.Figure() + + # Add nodes to the figure + for i, node in enumerate(wrapped_nodes): + colored_node = color_highlighted_words(node, global_color_map) + x, y = positions[i] + fig1.add_trace(go.Scatter( + x=[-x], # Reflect the x coordinate + y=[y], + mode='markers', + marker=dict(size=20, color='blue', line=dict(color='black', width=2)), + hoverinfo='none' + )) + fig1.add_annotation( + x=-x, # Reflect the x coordinate + y=y, + text=colored_node, + showarrow=False, + xshift=15, + align="center", + font=dict(size=12), + bordercolor='black', + borderwidth=2, + borderpad=4, + bgcolor='white', + width=400, + height=100 + ) + + # Add edges and text above each edge + for i, edge in enumerate(edges): + x0, y0 = positions[edge[0]] + x1, y1 = positions[edge[1]] + + # Use strategy if available, otherwise use default edge text + if strategies and i < len(strategies): + edge_text = strategies[i] + else: + edge_text = default_edge_texts[i % len(default_edge_texts)] + + fig1.add_trace(go.Scatter( + x=[-x0, -x1], # Reflect the x coordinates + y=[y0, y1], + mode='lines', + line=dict(color='black', width=1) + )) + + # Calculate the midpoint of the edge + mid_x = (-x0 + -x1) / 2 + mid_y = (y0 + y1) / 2 + + # Adjust y position to shift text upwards + text_y_position = mid_y + 0.8 # Increase this value to shift the text further upwards + + # Add text annotation above the edge + fig1.add_annotation( + x=mid_x, + y=text_y_position, + text=edge_text, # Use the text specific to this edge + showarrow=False, + font=dict(size=12), + align="center" + ) + + fig1.update_layout( + showlegend=False, + margin=dict(t=50, b=50, l=50, r=50), + xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), + yaxis=dict(showgrid=False, zeroline=False, showticklabels=False), + width=800 + max_level * 200, # Adjusted width to accommodate more levels + height=300 + len(nodes) * 100, # Adjusted height to accommodate more levels + plot_bgcolor='rgba(240,240,240,0.2)', + paper_bgcolor='white' + ) + + return fig1 + +def generate_subplot2(masked_sentences, sampled_sentences, highlight_info, common_grams): + """ + Generates a subplot visualizing multiple masked sentences and their sampled variants in a tree structure. + Each masked sentence will have multiple sampled sentences derived from it using different sampling techniques. + + Args: + masked_sentences (list of str): A list of masked sentences to be visualized as root nodes. + sampled_sentences (list of str): A list of sampled sentences derived from masked sentences. + highlight_info (list of tuples): A list of tuples where each tuple contains a word and its associated color for highlighting. 
+ common_grams (list of tuples): A list of tuples containing an index and a common word or phrase for LCS numbering. + + Returns: + plotly.graph_objects.Figure: A Plotly figure representing the tree structure with highlighted words and labeled edges. + """ + # Define sampling techniques + sampling_techniques = [ + "Greedy Sampling", + "Temperature Sampling", + "Exponential Minimum Sampling", + "Inverse Transform Sampling" + ] + + # Calculate total number of nodes + num_masked = len(masked_sentences) + num_sampled_per_masked = len(sampling_techniques) + total_nodes = num_masked + (num_masked * num_sampled_per_masked) + + # Combine all sentences into nodes list with appropriate labels + nodes = [] + # Level 0: masked sentences (root nodes) + nodes.extend([s + ' L0' for s in masked_sentences]) + + # Level 1: sampled sentences (branch nodes) + # For each masked sentence, we should have samples from each technique + sampled_nodes = [] + + # Validate if we have the expected number of sampled sentences + expected_sampled_count = num_masked * num_sampled_per_masked + if len(sampled_sentences) < expected_sampled_count: + # If insufficient samples provided, pad with placeholder sentences + print(f"Warning: Expected {expected_sampled_count} sampled sentences, but got {len(sampled_sentences)}") + while len(sampled_sentences) < expected_sampled_count: + sampled_sentences.append(f"Placeholder sampled sentence {len(sampled_sentences) + 1}") + + # Add all sampled sentences with level information + for s in sampled_sentences[:expected_sampled_count]: + sampled_nodes.append(s + ' L1') + + nodes.extend(sampled_nodes) + + def apply_lcs_numbering(sentence, common_grams): + """ + Applies LCS numbering to the sentence based on the common_grams. + """ + for idx, lcs in common_grams: + sentence = re.sub(rf"\b{lcs}\b", f"({idx}){lcs}", sentence) + return sentence + + # Apply LCS numbering + nodes = [apply_lcs_numbering(node, common_grams) for node in nodes] + + def highlight_words(sentence, color_map): + """ + Highlights words in the sentence based on the color_map. + """ + for word, color in color_map.items(): + sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE) + return sentence + + # Helper function to color highlighted words + def color_highlighted_words(node, color_map): + """ + Colors the highlighted words in the node text. + """ + parts = re.split(r'(\{\{.*?\}\})', node) + colored_parts = [] + for part in parts: + match = re.match(r'\{\{(.*?)\}\}', part) + if match: + word = match.group(1) + color = color_map.get(word, 'black') + colored_parts.append(f"{word}") + else: + colored_parts.append(part) + return ''.join(colored_parts) + + # Clean nodes, highlight words, and wrap text + cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes] + global_color_map = dict(highlight_info) + highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes] + wrapped_nodes = ['
'.join(textwrap.wrap(node, width=80)) for node in highlighted_nodes] + + # Generate edges based on the tree structure + def get_levels_and_edges(nodes): + levels = {} + edges = [] + + # Extract level info from node labels + for i, node in enumerate(nodes): + level = int(node.split()[-1][1]) + levels[i] = level + + # Create edges from masked sentences to their sampled variants + for masked_idx in range(num_masked): + # For each masked sentence, create edges to its sampled variants + for technique_idx in range(num_sampled_per_masked): + sampled_idx = num_masked + (masked_idx * num_sampled_per_masked) + technique_idx + if sampled_idx < len(nodes): + edges.append((masked_idx, sampled_idx)) + + return levels, edges + + levels, edges = get_levels_and_edges(nodes) + + # Calculate positions with improved spacing + positions = {} + + # Calculate horizontal spacing for the root nodes (masked sentences) + root_x_spacing = 0 # All root nodes at x=0 + root_y_spacing = 8.0 # Vertical spacing between root nodes + + # Calculate positions for sampled nodes + sampled_x = 3 # X position for all sampled nodes + + # Calculate y positions for root nodes (masked sentences) + root_y_start = -(num_masked - 1) * root_y_spacing / 2 + for i in range(num_masked): + positions[i] = (root_x_spacing, root_y_start + i * root_y_spacing) + + # Calculate y positions for sampled nodes + for masked_idx in range(num_masked): + root_y = positions[masked_idx][1] # Y position of parent masked sentence + + # Calculate y-spacing for children of this root + children_y_spacing = 1.5 # Vertical spacing between children of the same root + children_y_start = root_y - (num_sampled_per_masked - 1) * children_y_spacing / 2 + + # Position each child + for technique_idx in range(num_sampled_per_masked): + child_idx = num_masked + (masked_idx * num_sampled_per_masked) + technique_idx + child_y = children_y_start + technique_idx * children_y_spacing + positions[child_idx] = (sampled_x, child_y) + + # Create figure + fig2 = go.Figure() + + # Add nodes + for i, node in enumerate(wrapped_nodes): + x, y = positions[i] + + # Define node color based on level + node_color = 'blue' if levels[i] == 0 else 'green' + + # Add the node marker + fig2.add_trace(go.Scatter( + x=[x], + y=[y], + mode='markers', + marker=dict(size=20, color=node_color, line=dict(color='black', width=2)), + hoverinfo='none' + )) + + # Add node label with highlighting + colored_node = color_highlighted_words(node, global_color_map) + + fig2.add_annotation( + x=x, + y=y, + text=colored_node, + showarrow=False, + xshift=15, + align="left", + font=dict(size=12), + bordercolor='black', + borderwidth=2, + borderpad=4, + bgcolor='white', + width=400, + height=100 + ) + + # Add edges with labels + for i, (src, dst) in enumerate(edges): + x0, y0 = positions[src] + x1, y1 = positions[dst] + + # Draw the edge + fig2.add_trace(go.Scatter( + x=[x0, x1], + y=[y0, y1], + mode='lines', + line=dict(color='black', width=1) + )) + + # Add sampling technique label + # Determine which sampling technique this is + parent_idx = src + technique_count = sum(1 for k, (s, _) in enumerate(edges) if s == parent_idx and k < i) + technique_label = sampling_techniques[technique_count % len(sampling_techniques)] + + # Calculate midpoint for the label + mid_x = (x0 + x1) / 2 + mid_y = (y0 + y1) / 2 + + # Add slight offset to avoid overlap + label_offset = 0.1 + + fig2.add_annotation( + x=mid_x, + y=mid_y + label_offset, + text=technique_label, + showarrow=False, + font=dict(size=8), + align="center" + ) + + # 
Update layout + fig2.update_layout( + showlegend=False, + margin=dict(t=20, b=20, l=20, r=20), + xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), + yaxis=dict(showgrid=False, zeroline=False, showticklabels=False), + width=1200, # Adjusted width to accommodate more levels + height=2000, # Adjusted height to accommodate more levels + plot_bgcolor='rgba(240,240,240,0.2)', + paper_bgcolor='white' + + ) + + return fig2 + +if __name__ == "__main__": + paraphrased_sentence = "The quick brown fox jumps over the lazy dog." + masked_sentences = [ + "A fast brown fox leaps over the lazy dog.", + "A quick brown fox hops over a lazy dog." + ] + highlight_info = [ + ("quick", "red"), + ("brown", "green"), + ("fox", "blue"), + ("lazy", "purple") + ] + common_grams = [ + (1, "quick brown fox"), + (2, "lazy dog") + ] + + fig1 = generate_subplot1(paraphrased_sentence, masked_sentences, highlight_info, common_grams) + fig1.show() + + sampled_sentence = ["A fast brown fox jumps over a lazy dog."] + + + fig2 = generate_subplot2(masked_sentences, sampled_sentence, highlight_info, common_grams) + fig2.show() diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..576b038724ea2b6389f67fec2ebdbe2e21468e03 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,5 @@ +from utils.watermark import Watermarker +from utils.paraphraser import Paraphraser +from utils.entailment import EntailmentAnalyzer +from utils.sampling import SamplingProcessor +from utils.config import load_config \ No newline at end of file diff --git a/utils/__pycache__/__init__.cpython-310.pyc b/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30ce4a254bd80ace4f22d103dd473a1cf9b3283e Binary files /dev/null and b/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/utils/__pycache__/__init__.cpython-311.pyc b/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b5555a79f7cb76bcd3e4f723e9028acf1f26933 Binary files /dev/null and b/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/utils/__pycache__/config.cpython-310.pyc b/utils/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3582700fa53896a8194864be186753ee3a8ab82 Binary files /dev/null and b/utils/__pycache__/config.cpython-310.pyc differ diff --git a/utils/__pycache__/config.cpython-311.pyc b/utils/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c96b59a35d6728248efd21863acee2f81f1a645a Binary files /dev/null and b/utils/__pycache__/config.cpython-311.pyc differ diff --git a/utils/__pycache__/entailment.cpython-310.pyc b/utils/__pycache__/entailment.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68ad206c84236b0a2ec5d00f2bfe9ee69dc67e3c Binary files /dev/null and b/utils/__pycache__/entailment.cpython-310.pyc differ diff --git a/utils/__pycache__/entailment.cpython-311.pyc b/utils/__pycache__/entailment.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6fc993c2719d8063a2bba33273f7adc968e2950 Binary files /dev/null and b/utils/__pycache__/entailment.cpython-311.pyc differ diff --git a/utils/__pycache__/masking_methods.cpython-310.pyc b/utils/__pycache__/masking_methods.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b99bf858b1d9cdc684d8d785f7d7c28733a7240 Binary 
files /dev/null and b/utils/__pycache__/masking_methods.cpython-310.pyc differ diff --git a/utils/__pycache__/masking_methods.cpython-311.pyc b/utils/__pycache__/masking_methods.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..018ef6f3519408daa5502ad0905b3d207341d76a Binary files /dev/null and b/utils/__pycache__/masking_methods.cpython-311.pyc differ diff --git a/utils/__pycache__/non_melting_point.cpython-310.pyc b/utils/__pycache__/non_melting_point.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23bd50bb431040f19671e32be8856a325ab25f80 Binary files /dev/null and b/utils/__pycache__/non_melting_point.cpython-310.pyc differ diff --git a/utils/__pycache__/non_melting_point.cpython-311.pyc b/utils/__pycache__/non_melting_point.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cdb66bfaa7274f08f281416d1ada8bb975503381 Binary files /dev/null and b/utils/__pycache__/non_melting_point.cpython-311.pyc differ diff --git a/utils/__pycache__/paraphraser.cpython-310.pyc b/utils/__pycache__/paraphraser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aadaec55f8b172edcbd271df987813ef13c80eb4 Binary files /dev/null and b/utils/__pycache__/paraphraser.cpython-310.pyc differ diff --git a/utils/__pycache__/paraphraser.cpython-311.pyc b/utils/__pycache__/paraphraser.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06362f19d3beb20b5acf6e9f37130b73f000510c Binary files /dev/null and b/utils/__pycache__/paraphraser.cpython-311.pyc differ diff --git a/utils/__pycache__/sampling.cpython-310.pyc b/utils/__pycache__/sampling.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec8afbc843e5e7d97325fa9438f2fd686c5743b0 Binary files /dev/null and b/utils/__pycache__/sampling.cpython-310.pyc differ diff --git a/utils/__pycache__/sampling.cpython-311.pyc b/utils/__pycache__/sampling.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3d44f57068e2d56d8db99e96f55526db27eeb78 Binary files /dev/null and b/utils/__pycache__/sampling.cpython-311.pyc differ diff --git a/utils/__pycache__/watermark.cpython-310.pyc b/utils/__pycache__/watermark.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9943c62cbae737b816cd03eb62e9229ab1a3ba0e Binary files /dev/null and b/utils/__pycache__/watermark.cpython-310.pyc differ diff --git a/utils/__pycache__/watermark.cpython-311.pyc b/utils/__pycache__/watermark.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb5322d289d38244fa4f36d3acac5cb06b934edb Binary files /dev/null and b/utils/__pycache__/watermark.cpython-311.pyc differ diff --git a/utils/config.py b/utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..2d2997275e03a0ab219f78f65482d7cead8b5ebd --- /dev/null +++ b/utils/config.py @@ -0,0 +1,18 @@ +""" +This file loads config from config.yaml +""" + +import yaml + +def load_config(path): + """ + Function to load config from config.yaml + """ + try: + with open(path, "r") as file: + config = yaml.safe_load(file) + return config + except FileNotFoundError: + raise FileNotFoundError("Config file not found") + except Exception as e: + raise e diff --git a/utils/config.yaml b/utils/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f91e3fcf8a5ed368d540256d4a2489a4752f9b34 --- /dev/null +++ b/utils/config.yaml @@ -0,0 
+1,48 @@ +# This is the official config file. +PECCAVI_TEXT: + Entailment: + task: "text-classification" + model: "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli" + + Masking: + task: "fill-mask" + tokenizer: "bert-base-uncased" + model: "bert-base-uncased" + # tokenizer: "bert-large-cased-whole-word-masking" + # model: "bert-large-cased-whole-word-masking" + + Vocabulary: + tokenizer: "bert-base-uncased" + model: "bert-base-uncased" + # permissible_ratio: 0.5 + # tokenizer: "bert-large-cased-whole-word-masking" + # model: "bert-large-cased-whole-word-masking" + permissible_ratio: 1.0 + + Sampling: + tokenizer: "bert-base-uncased" + model: "bert-base-uncased" + # tokenizer: "bert-large-cased-whole-word-masking" + # model: "bert-large-cased-whole-word-masking" + + Metrics: + EuclideanDistance: "sentence-transformers/all-MiniLM-L6-v2" + Distortion: "gpt2" + + Detector: + tokenizer: "bert-base-uncased" + model: "bert-base-uncased" + # tokenizer: "bert-large-cased-whole-word-masking" + # model: "bert-large-cased-whole-word-masking" + + Paraphrase: + tokenizer: "humarin/chatgpt_paraphraser_on_T5_base" + model: "humarin/chatgpt_paraphraser_on_T5_base" + num_beams: 10 + num_beam_groups: 10 + num_return_sequences: 10 + repetition_penalty: 10.0 + diversity_penalty: 3.0 + no_repeat_ngram_size: 2 + temperature: 0.7 + max_length: 64 diff --git a/utils/entailment.py b/utils/entailment.py new file mode 100644 index 0000000000000000000000000000000000000000..aba1608ea349ffd7649d0c83a053a6433d8e489a --- /dev/null +++ b/utils/entailment.py @@ -0,0 +1,107 @@ +import sys +import os +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +import numpy as np +from transformers import pipeline +from typing import List +from utils.config import load_config + + +class EntailmentAnalyzer: + # def __init__(self, config_path: str): + def __init__(self, config): + """ + Initialize the EntailmentAnalyzer with the config file path. + + Args: + config_path: The path to the configuration file. + """ + # self.config = load_config(config_path)['PECCAVI_TEXT']['Entailment'] + self.config = config + self.entailment_pipeline = pipeline(task=self.config['task'], model=self.config['model']) + + def check_entailment(self, premise: str, hypothesis: str) -> float: + """ + Check entailment between the premise and hypothesis. + + Args: + premise: The premise sentence. + hypothesis: The hypothesis sentence. + + Returns: + float: The entailment score. + """ + results = self.entailment_pipeline(f"{premise} [SEP] {hypothesis}", top_k=None) + entailment_score = next(item['score'] for item in results if item['label'] == 'entailment') + return entailment_score + + def analyze_entailment(self, original_sentence: str, paraphrased_sentences: List[str], threshold: float) -> tuple: + """ + Analyze entailment scores for paraphrased sentences. If no selected sentences are found, + lower the threshold and rerun the analysis. + + Args: + original_sentence: The original sentence. + paraphrased_sentences: List of paraphrased sentences. + threshold: Minimum score to select a sentence. + + Returns: + tuple: A dictionary of all scores, selected sentences, and discarded sentences. 
+ """ + all_sentences = {} + selected_sentences = {} + discarded_sentences = {} + + # Loop to reduce threshold if no sentences are selected + while not selected_sentences: + for paraphrased_sentence in paraphrased_sentences: + entailment_score = self.check_entailment(original_sentence, paraphrased_sentence) + + all_sentences[paraphrased_sentence] = entailment_score + if entailment_score >= threshold: + selected_sentences[paraphrased_sentence] = entailment_score + else: + discarded_sentences[paraphrased_sentence] = entailment_score + + # If no sentences are selected, lower the threshold + if not selected_sentences: + print(f"No selected sentences found. Lowering the threshold by 0.1 (from {threshold} to {threshold - 0.1}).") + threshold -= 0.1 + if threshold <= 0: + print("Threshold has reached 0. No sentences meet the criteria.") + break + + return all_sentences, selected_sentences, discarded_sentences + + +if __name__ == "__main__": + config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.yaml') + + config_path = '/home/ashhar21137/text_wm/scratch/utils/config/config.yaml' + + config = load_config(config_path) + + entailment_analyzer = EntailmentAnalyzer(config['PECCAVI_TEXT']['Entailment']) + + all_sentences, selected_sentences, discarded_sentences = entailment_analyzer.analyze_entailment( + "The weather is nice today", + [ + "The climate is pleasant today", + "It's a good day weather-wise", + "Today, the weather is terrible", + "What a beautiful day it is", + "The sky is clear and the weather is perfect", + "It's pouring rain outside today", + "The weather isn't bad today", + "A lovely day for outdoor activities" + ], + 0.7 + ) + + print("----------------------- All Sentences -----------------------") + print(all_sentences) + print("----------------------- Discarded Sentences -----------------------") + print(discarded_sentences) + print("----------------------- Selected Sentences -----------------------") + print(selected_sentences) diff --git a/utils/masking_methods.py b/utils/masking_methods.py new file mode 100644 index 0000000000000000000000000000000000000000..987afb2b026d04467b0c4e73486fd42961e2cb4f --- /dev/null +++ b/utils/masking_methods.py @@ -0,0 +1,304 @@ +import random +import torch +import logging +from transformers import BertTokenizer, BertForMaskedLM +from nltk.corpus import stopwords +import nltk +from transformers import RobertaTokenizer, RobertaForMaskedLM +from tqdm import tqdm + +# Set logging to WARNING for a cleaner terminal. 
+logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +# Ensure stopwords are downloaded +try: + nltk.data.find('corpora/stopwords') +except LookupError: + nltk.download('stopwords') + +class MaskingProcessor: + def __init__(self, tokenizer, model): + self.tokenizer = tokenizer + self.model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.stop_words = set(stopwords.words('english')) + tqdm.write(f"[MaskingProcessor] Initialized on device: {self.device}") + + def remove_stopwords(self, words): + return [word for word in words if word.lower() not in self.stop_words] + + def adjust_ngram_indices(self, original_words, common_ngrams): + logger.info("Adjusting n-gram indices.") + non_stop_words = self.remove_stopwords(original_words) + original_to_non_stop = [] + non_stop_idx = 0 + for original_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + original_to_non_stop.append((original_idx, non_stop_idx)) + non_stop_idx += 1 + adjusted_ngrams = {} + for ngram, positions in common_ngrams.items(): + adjusted_positions = [] + for start, end in positions: + try: + new_start = next(non_stop for orig, non_stop in original_to_non_stop if orig == start) + new_end = next(non_stop for orig, non_stop in original_to_non_stop if orig == end) + adjusted_positions.append((new_start, new_end)) + except StopIteration: + continue + adjusted_ngrams[ngram] = adjusted_positions + return adjusted_ngrams + + def mask_sentence_random(self, sentence, common_ngrams): + tqdm.write(f"[MaskingProcessor] Masking (random) sentence: {sentence}") + original_words = sentence.split() + has_punctuation = False + punctuation = '' + if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']): + has_punctuation = True + punctuation = original_words[-1][-1] + original_words = original_words[:-1] + + non_stop_words = self.remove_stopwords(original_words) + adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams) + mask_indices = [] + + ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions] + if ngram_positions: + first_ngram_start = ngram_positions[0][0] + if first_ngram_start > 0: + mask_index_before_ngram = random.randint(0, first_ngram_start-1) + mask_indices.append(mask_index_before_ngram) + + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][1] + start_next = ngram_positions[i + 1][0] + if start_next > end_prev + 1: + mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1) + mask_indices.append(mask_index_between_ngrams) + + last_ngram_end = ngram_positions[-1][1] + if last_ngram_end < len(non_stop_words) - 1: + mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1) + mask_indices.append(mask_index_after_ngram) + + non_stop_to_original = {} + non_stop_idx = 0 + for orig_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + non_stop_to_original[non_stop_idx] = orig_idx + non_stop_idx += 1 + + original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices] + masked_words = original_words.copy() + for idx in original_mask_indices: + masked_words[idx] = self.tokenizer.mask_token + + if has_punctuation: + masked_words.append(punctuation) + + logger.info(f"Masked sentence (random): {' '.join(masked_words)}") + return " 
".join(masked_words), original_mask_indices + + def mask_sentence_pseudorandom(self, sentence, common_ngrams): + logger.info(f"Masking sentence using pseudorandom strategy: {sentence}") + random.seed(3) + original_words = sentence.split() + has_punctuation = False + punctuation = '' + if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']): + has_punctuation = True + punctuation = original_words[-1][-1] + original_words = original_words[:-1] + + non_stop_words = self.remove_stopwords(original_words) + adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams) + mask_indices = [] + ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions] + + if ngram_positions: + first_ngram_start = ngram_positions[0][0] + if first_ngram_start > 0: + mask_index_before_ngram = random.randint(0, first_ngram_start-1) + mask_indices.append(mask_index_before_ngram) + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][1] + start_next = ngram_positions[i + 1][0] + if start_next > end_prev + 1: + mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1) + mask_indices.append(mask_index_between_ngrams) + last_ngram_end = ngram_positions[-1][1] + if last_ngram_end < len(non_stop_words) - 1: + mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1) + mask_indices.append(mask_index_after_ngram) + + non_stop_to_original = {} + non_stop_idx = 0 + for orig_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + non_stop_to_original[non_stop_idx] = orig_idx + non_stop_idx += 1 + + original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices] + masked_words = original_words.copy() + for idx in original_mask_indices: + masked_words[idx] = self.tokenizer.mask_token + + if has_punctuation: + masked_words.append(punctuation) + + logger.info(f"Masked sentence (pseudorandom): {' '.join(masked_words)}") + return " ".join(masked_words), original_mask_indices + + def mask_sentence_entropy(self, sentence, common_ngrams): + logger.info(f"Masking sentence using entropy strategy: {sentence}") + original_words = sentence.split() + has_punctuation = False + punctuation = '' + if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']): + has_punctuation = True + punctuation = original_words[-1][-1] + original_words = original_words[:-1] + + non_stop_words = self.remove_stopwords(original_words) + adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams) + mask_indices = [] + ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions] + non_stop_to_original = {} + non_stop_idx = 0 + for orig_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + non_stop_to_original[non_stop_idx] = orig_idx + non_stop_idx += 1 + + if ngram_positions: + first_ngram_start = ngram_positions[0][0] + if first_ngram_start > 0: + candidate_positions = range(0, first_ngram_start) + entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) for pos in candidate_positions] + mask_indices.append(max(entropies, key=lambda x: x[1])[0]) + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][1] + start_next = ngram_positions[i + 1][0] + if start_next > end_prev + 1: + candidate_positions = range(end_prev + 1, start_next) + entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) for pos in 
candidate_positions] + mask_indices.append(max(entropies, key=lambda x: x[1])[0]) + last_ngram_end = ngram_positions[-1][1] + if last_ngram_end < len(non_stop_words) - 1: + candidate_positions = range(last_ngram_end + 1, len(non_stop_words)) + entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) for pos in candidate_positions] + mask_indices.append(max(entropies, key=lambda x: x[1])[0]) + + original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices] + masked_words = original_words.copy() + for idx in original_mask_indices: + masked_words[idx] = self.tokenizer.mask_token + + if has_punctuation: + masked_words.append(punctuation) + + logger.info(f"Masked sentence (entropy): {' '.join(masked_words)}") + return " ".join(masked_words), original_mask_indices + + def calculate_mask_logits(self, original_sentence, original_mask_indices): + logger.info(f"Calculating mask logits for sentence: {original_sentence}") + words = original_sentence.split() + mask_logits = {} + for idx in original_mask_indices: + masked_words = words.copy() + masked_words[idx] = self.tokenizer.mask_token + masked_sentence = " ".join(masked_words) + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"].to(self.device) + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + mask_logits_tensor = logits[0, mask_token_index, :] + top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 100, dim=-1) + top_tokens = [] + top_logits = [] + seen_words = set() + for token_id, logit in zip(top_mask_indices[0], top_mask_logits[0]): + token = self.tokenizer.convert_ids_to_tokens(token_id.item()) + if token.startswith('##'): + continue + word = self.tokenizer.convert_tokens_to_string([token]).strip() + if word and word not in seen_words: + seen_words.add(word) + top_tokens.append(word) + top_logits.append(logit.item()) + if len(top_tokens) == 50: + break + mask_logits[idx] = { + "tokens": top_tokens, + "logits": top_logits + } + logger.info("Completed calculating mask logits.") + return mask_logits + + def calculate_word_entropy(self, sentence, word_position): + logger.info(f"Calculating word entropy for position {word_position} in sentence: {sentence}") + words = sentence.split() + masked_words = words.copy() + masked_words[word_position] = self.tokenizer.mask_token + masked_sentence = " ".join(masked_words) + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"].to(self.device) + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1) + entropy = -torch.sum(probs * torch.log(probs + 1e-9)) + logger.info(f"Computed entropy: {entropy.item()}") + return entropy.item() + + def process_sentences(self, sentences_list, common_grams, method="random"): + tqdm.write(f"[MaskingProcessor] Processing sentences using method: {method}") + results = {} + for sentence, ngrams in tqdm(common_grams.items(), desc="Masking Sentences"): + words = sentence.split() + last_word = words[-1] + if any(last_word.endswith(p) for p in ['.', ',', '!', '?', ';', ':']): + words[-1] = last_word[:-1] + punctuation = last_word[-1] + processed_sentence = " ".join(words) + " " + punctuation + else: + processed_sentence = sentence + + if method == "random": + masked_sentence, 
original_mask_indices = self.mask_sentence_random(processed_sentence, ngrams) + elif method == "pseudorandom": + masked_sentence, original_mask_indices = self.mask_sentence_pseudorandom(processed_sentence, ngrams) + else: # entropy + masked_sentence, original_mask_indices = self.mask_sentence_entropy(processed_sentence, ngrams) + + logits = self.calculate_mask_logits(processed_sentence, original_mask_indices) + results[sentence] = { + "masked_sentence": masked_sentence, + "mask_logits": logits + } + logger.info(f"Processed sentence: {sentence}") + tqdm.write("[MaskingProcessor] Completed processing sentences.") + return results + +if __name__ == "__main__": + sentences = [ + "The quick brown fox jumps over small cat the lazy dog everyday again and again .", + ] + result_dict = { + 'The quick brown fox jumps over small cat the lazy dog everyday again and again .': { + 'brown fox': [(2, 3)], + 'cat': [(7, 7)], + 'dog': [(10, 10)] + } + } + processor = MaskingProcessor( + BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking"), + BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking") + ) + results_random = processor.process_sentences(sentences, result_dict, method="random") + for sentence, output in results_random.items(): + logger.info(f"Original Sentence (Random): {sentence}") + logger.info(f"Masked Sentence (Random): {output['masked_sentence']}") diff --git a/utils/non_melting_point.py b/utils/non_melting_point.py new file mode 100644 index 0000000000000000000000000000000000000000..b61788f5c06a22ec14d2c9af94cdff8fcee819d6 --- /dev/null +++ b/utils/non_melting_point.py @@ -0,0 +1,137 @@ +import nltk +import logging +from nltk.corpus import stopwords +from nltk.util import ngrams +from collections import Counter +import re +from tqdm import tqdm + +# Set logging to WARNING for minimal console output. +logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +class NgramProcessor: + def __init__(self): + try: + nltk.data.find('corpora/stopwords') + except LookupError: + nltk.download('stopwords') + self.stop_words = set(stopwords.words('english')) + tqdm.write("[NgramProcessor] Initialized with stopwords.") + + def remove_stopwords(self, text): + # No need for extensive logging inside this helper.
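# --- Illustrative sketch (not part of the patch) of the filtering idea implemented by
# --- find_filtered_ngrams below: strip stopwords, collect n-grams from longest (4-gram)
# --- down to unigrams, and keep an n-gram only if it occurs in every cleaned sentence and
# --- is not contained in an already-kept longer n-gram. A tiny hard-coded stopword set and
# --- the helper names here are stand-ins for NLTK's stopword list and the class methods,
# --- so the snippet runs on its own.
import re

_TOY_STOPWORDS = {"the", "a", "an", "over"}

def _clean(text):
    words = re.findall(r"\w+", text.lower())
    return [w for w in words if w not in _TOY_STOPWORDS]

def common_non_melting_ngrams(sentences, max_n=4):
    cleaned = [_clean(s) for s in sentences]
    kept = []
    for n in range(max_n, 0, -1):
        # Candidate n-grams are taken from the first sentence, mirroring the class below.
        candidates = {" ".join(g) for g in zip(*[cleaned[0][i:] for i in range(n)])}
        for cand in sorted(candidates):
            in_all = all(cand in " ".join(words) for words in cleaned)
            subsumed = any(cand in longer for longer in kept)
            if in_all and not subsumed:
                kept.append(cand)
    return kept

# Example with the sentences used in the __main__ block further below:
# common_non_melting_ngrams([
#     "The quick brown fox jumps over the lazy dog .",
#     "A speedy brown fox jumps over a lazy dog.",
#     "A swift brown fox leaps over the lethargic dog.",
# ])  # -> ['brown fox', 'dog']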
+ words = re.findall(r'\w+', text.lower()) + filtered_words = [word for word in words if word not in self.stop_words] + return ' '.join(filtered_words) + + def is_exact_match(self, ngram, sentences): + logger.info(f"Checking exact match for ngram: {ngram}") + result = all(ngram in sentence for sentence in sentences) + logger.info(f"Exact match result for '{ngram}': {result}") + return result + + def is_substring_of_any(self, ngram, common_ngrams): + logger.info(f"Checking if ngram: {ngram} is substring of any common ngram.") + result = any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram) + logger.info(f"Substring check result for '{ngram}': {result}") + return result + + def find_filtered_ngrams(self, sentences): + from collections import Counter + tqdm.write("[NgramProcessor] Cleaning sentences...") + sentences_cleaned = [self.remove_stopwords(sentence) + for sentence in tqdm(sentences, desc="Cleaning Sentences")] + ngram_lengths = [4, 3, 2, 1] + common_ngrams = [] + result = {} + for n in ngram_lengths: + ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences_cleaned] + ngrams_counter = Counter(ngrams_list[0]) + for ngram in ngrams_counter: + ngram_str = ' '.join(ngram) + if any(word in self.stop_words for word in ngram_str.split()): + continue + if self.is_exact_match(ngram_str, sentences_cleaned) and not self.is_substring_of_any(ngram_str, common_ngrams): + common_ngrams.append(ngram_str) + for sentence, cleaned_sentence in tqdm(zip(sentences, sentences_cleaned), + total=len(sentences), + desc="Mapping N-grams"): + sentence_result = {} + original_words = sentence.split() + cleaned_words = cleaned_sentence.split() + index_map = {} + cleaned_idx = 0 + for orig_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + index_map[cleaned_idx] = orig_idx + cleaned_idx += 1 + for ngram in common_ngrams: + ngram_words = ngram.split() + indices = [] + for i in range(len(cleaned_words) - len(ngram_words) + 1): + if cleaned_words[i:i + len(ngram_words)] == ngram_words: + if i in index_map: + start_idx = index_map[i] + end_idx = index_map.get(i + len(ngram_words) - 1, start_idx) + if end_idx - start_idx == len(ngram_words) - 1: + indices.append((start_idx, end_idx)) + + if indices: + sentence_result[ngram] = indices + result[sentence] = sentence_result + return result + + # def find_relative_order(self, sentence, common_ngrams): + # from tqdm import tqdm + # relative_order = [] + # for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False): + # index = sentence.find(ngram) + # if index != -1: + # relative_order.append((index, ngram)) + # return sorted(relative_order) + + def find_relative_order(self, sentence, common_ngrams): + from tqdm import tqdm + sentence = sentence.lower() + relative_order = [] + + for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False): + index = sentence.find(ngram.lower()) + if index != -1: + relative_order.append((index, ngram)) + + sorted_pairs = sorted(relative_order) + return [(i+1, ngram) for i, (_, ngram) in enumerate(sorted_pairs)] + +# Example usage +if __name__ == "__main__": + sentences = [ + "The quick brown fox jumps over the lazy dog .", + "A speedy brown fox jumps over a lazy dog.", + "A swift brown fox leaps over the lethargic dog.", + ] + processor = NgramProcessor() + common_ngrams = processor.find_filtered_ngrams(sentences) + print(common_ngrams) + # modified_output = list({ + # (indices[0][0], gram) + # for grams in common_ngrams.values() + # for gram, 
indices in grams.items() + # }) + # print(modified_output) + logger.info(f"Common n-grams and their indices per sentence: {common_ngrams}") + for sentence in sentences: + order = processor.find_relative_order(sentence, common_ngrams[sentence]) + logger.info(f"Sentence: {sentence} -> Order: {order}") + + +""" + +{ +'The quick brown fox jumps over the lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}, +'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}, +'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]} +} +""" + diff --git a/utils/old/masking/masking_methods.py b/utils/old/masking/masking_methods.py new file mode 100644 index 0000000000000000000000000000000000000000..fd80c84a38f205374a5b911d1e16a8d0fc27d2a0 --- /dev/null +++ b/utils/old/masking/masking_methods.py @@ -0,0 +1,355 @@ +import random +import torch +from transformers import BertTokenizer, BertForMaskedLM +from nltk.corpus import stopwords +import nltk + +# Ensure stopwords are downloaded +try: + nltk.data.find('corpora/stopwords') +except LookupError: + nltk.download('stopwords') + +class MaskingProcessor: + def __init__(self, ): + self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + self.model = BertForMaskedLM.from_pretrained("bert-base-uncased") + self.stop_words = set(stopwords.words('english')) + + def adjust_ngram_indices(self, words, common_ngrams, remove_stopwords): + """ + Adjust indices of common n-grams after removing stop words. + + Args: + words (list): List of words in the original sentence. + common_ngrams (dict): Common n-grams and their indices. + + Returns: + dict: Adjusted common n-grams and their indices. + """ + if not remove_stopwords: + return common_ngrams + + non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words] + adjusted_ngrams = {} + + for ngram, positions in common_ngrams.items(): + adjusted_positions = [] + for start, end in positions: + try: + new_start = non_stop_word_indices.index(start) + new_end = non_stop_word_indices.index(end) + adjusted_positions.append((new_start, new_end)) + except ValueError: + continue # Skip if indices cannot be mapped + adjusted_ngrams[ngram] = adjusted_positions + + return adjusted_ngrams + + # def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords): + # """ + # Mask one word before the first common n-gram, one between two n-grams, + # and one after the last common n-gram (random selection). 
+ + # Args: + # original_sentence (str): Original sentence + # common_ngrams (dict): Common n-grams and their indices + + # Returns: + # str: Masked sentence with original stop words retained + # """ + # words = original_sentence.split() + # if remove_stopwords: + # non_stop_words = [word for word in words if word.lower() not in self.stop_words] + # non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words] + # else: + # non_stop_words = words + # non_stop_word_indices = list(range(len(words))) + # # non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words + # adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords) + + # mask_indices = [] + # # Handle before the first common n-gram + # if adjusted_ngrams: + # first_ngram_start = list(adjusted_ngrams.values())[0][0][0] + # if first_ngram_start > 0: + # mask_indices.append(random.randint(0, first_ngram_start - 1)) + + # # Handle between common n-grams + # ngram_positions = list(adjusted_ngrams.values()) + # for i in range(len(ngram_positions) - 1): + # end_prev = ngram_positions[i][-1][1] + # start_next = ngram_positions[i + 1][0][0] + # if start_next > end_prev + 1: + # mask_indices.append(random.randint(end_prev + 1, start_next - 1)) + + # # Handle after the last common n-gram + # last_ngram_end = ngram_positions[-1][-1][1] + # if last_ngram_end < len(non_stop_words) - 1: + # mask_indices.append(random.randint(last_ngram_end + 1, len(non_stop_words) - 1)) + + # # Mask the chosen indices + # original_masked_sentence = words[:] + # # for idx in mask_indices: + # # if idx not in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]: + # # non_stop_words[idx] = self.tokenizer.mask_token + # # original_masked_sentence[idx] = self.tokenizer.mask_token + # for idx in mask_indices: + # if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]: + # continue # Skip if index belongs to common n-grams + # if remove_stopwords: + # original_idx = non_stop_word_indices[idx] # Map back to original indices + # original_masked_sentence[original_idx] = self.tokenizer.mask_token + # else: + # original_masked_sentence[idx] = self.tokenizer.mask_token + + + # return " ".join(original_masked_sentence) + def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords): + """ + Mask one word before the first common n-gram, one between two n-grams, + and one after the last common n-gram (random selection). 
+ + Args: + original_sentence (str): Original sentence + common_ngrams (dict): Common n-grams and their indices + remove_stopwords (bool): Whether to remove stop words + + Returns: + str: Masked sentence with original stop words retained + """ + words = original_sentence.split() + if remove_stopwords: + non_stop_words = [word for word in words if word.lower() not in self.stop_words] + non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words] + else: + non_stop_words = words + non_stop_word_indices = list(range(len(words))) + + adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords) + + # Collect all indices corresponding to common n-grams + common_ngram_indices = { + idx for ngram_positions in adjusted_ngrams.values() + for start, end in ngram_positions + for idx in range(start, end + 1) + } + + mask_indices = [] + # Handle before the first common n-gram + if adjusted_ngrams: + first_ngram_start = list(adjusted_ngrams.values())[0][0][0] + if first_ngram_start > 0: + potential_indices = [i for i in range(first_ngram_start) if i not in common_ngram_indices] + if potential_indices: + mask_indices.append(random.choice(potential_indices)) + + # Handle between common n-grams + ngram_positions = list(adjusted_ngrams.values()) + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][-1][1] + start_next = ngram_positions[i + 1][0][0] + potential_indices = [i for i in range(end_prev + 1, start_next) if i not in common_ngram_indices] + if potential_indices: + mask_indices.append(random.choice(potential_indices)) + + # Handle after the last common n-gram + last_ngram_end = ngram_positions[-1][-1][1] + if last_ngram_end < len(non_stop_words) - 1: + potential_indices = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i not in common_ngram_indices] + if potential_indices: + mask_indices.append(random.choice(potential_indices)) + + # Mask the chosen indices + original_masked_sentence = words[:] + for idx in mask_indices: + if remove_stopwords: + original_idx = non_stop_word_indices[idx] # Map back to original indices + original_masked_sentence[original_idx] = self.tokenizer.mask_token + else: + original_masked_sentence[idx] = self.tokenizer.mask_token + + return " ".join(original_masked_sentence) + + def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords): + """ + Mask one word before the first common n-gram, one between two n-grams, + and one after the last common n-gram (highest entropy selection). 
+ + Args: + original_sentence (str): Original sentence + common_ngrams (dict): Common n-grams and their indices + + Returns: + str: Masked sentence with original stop words retained + """ + words = original_sentence.split() + # non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words + if remove_stopwords: + non_stop_words = [word for word in words if word.lower() not in self.stop_words] + non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words] + else: + non_stop_words = words + non_stop_word_indices = list(range(len(words))) + adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords) + entropy_scores = {} + + for idx, word in enumerate(non_stop_words): + if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]: + continue # Skip words in common n-grams + + masked_sentence = non_stop_words[:idx] + [self.tokenizer.mask_token] + non_stop_words[idx + 1:] + masked_sentence = " ".join(masked_sentence) + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"] + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + + filtered_logits = logits[0, mask_token_index, :] + probs = torch.softmax(filtered_logits, dim=-1) + entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item() # Add epsilon to prevent log(0) + entropy_scores[idx] = entropy + + mask_indices = [] + + # Handle before the first common n-gram + if adjusted_ngrams: + first_ngram_start = list(adjusted_ngrams.values())[0][0][0] + candidates = [i for i in range(first_ngram_start) if i in entropy_scores] + if candidates: + mask_indices.append(max(candidates, key=lambda x: entropy_scores[x])) + + # Handle between common n-grams + ngram_positions = list(adjusted_ngrams.values()) + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][-1][1] + start_next = ngram_positions[i + 1][0][0] + candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores] + if candidates: + mask_indices.append(max(candidates, key=lambda x: entropy_scores[x])) + + # Handle after the last common n-gram + last_ngram_end = ngram_positions[-1][-1][1] + candidates = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i in entropy_scores] + if candidates: + mask_indices.append(max(candidates, key=lambda x: entropy_scores[x])) + + # Mask the chosen indices + original_masked_sentence = words[:] + # for idx in mask_indices: + # non_stop_words[idx] = self.tokenizer.mask_token + # original_masked_sentence[idx] = self.tokenizer.mask_token + + for idx in mask_indices: + if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]: + continue # Skip if index belongs to common n-grams + if remove_stopwords: + original_idx = non_stop_word_indices[idx] # Map back to original indices + original_masked_sentence[original_idx] = self.tokenizer.mask_token + else: + original_masked_sentence[idx] = self.tokenizer.mask_token + + + return " ".join(original_masked_sentence) + + def calculate_mask_logits(self, masked_sentence): + """ + Calculate logits for masked tokens in the sentence using BERT. 
+ + Args: + masked_sentence (str): Sentence with [MASK] tokens + + Returns: + dict: Masked token indices and their logits + """ + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"] + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + + mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index} + return mask_logits + + def process_sentences(self, original_sentences, result_dict, method="random", remove_stopwords=False): + """ + Process a list of sentences and calculate logits for masked tokens using the specified method. + + Args: + original_sentences (list): List of original sentences + result_dict (dict): Common n-grams and their indices for each sentence + method (str): Masking method ("random" or "entropy") + + Returns: + dict: Masked sentences and their logits for each sentence + """ + results = {} + + for sentence, ngrams in result_dict.items(): + if method == "random": + masked_sentence = self.mask_sentence_random(sentence, ngrams, remove_stopwords) + elif method == "entropy": + masked_sentence = self.mask_sentence_entropy(sentence, ngrams, remove_stopwords) + else: + raise ValueError("Invalid method. Choose 'random' or 'entropy'.") + + logits = self.calculate_mask_logits(masked_sentence) + results[sentence] = { + "masked_sentence": masked_sentence, + "mask_logits": logits + } + + return results + +# Example usage +if __name__ == "__main__": + # !!! Working both the cases regardless if the stopword is removed or not + sentences = [ + "The quick brown fox jumps over the lazy dog.", + "A speedy brown fox jumps over a lazy dog.", + "A swift brown fox leaps over the lethargic dog." + ] + result_dict ={ + 'The quick brown fox jumps over the lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}, + 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}, + 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]} + } + + + processor = MaskingProcessor() + results_random = processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=True) + # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False) + + for sentence, output in results_random.items(): + print(f"Original Sentence (Random): {sentence}") + print(f"Masked Sentence (Random): {output['masked_sentence']}") + # # print(f"Mask Logits (Random): {output['mask_logits']}") + # print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}') + # print(f' length of output["mask_logits"] : {len(output["mask_logits"])}') + # print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}') + print('--------------------------------') + # for mask_idx, logits in output["mask_logits"].items(): + # print(f"Logits for [MASK] at position {mask_idx}:") + # print(f' logits : {logits[:5]}') # List of logits for all vocabulary tokens + + + + + # result_dict = { + # "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}, + # "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}, + # "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]} + # } + + + # print('--------------------------------') + # for sentence, output in results_entropy.items(): + # print(f"Original Sentence (Entropy): {sentence}") + # print(f"Masked Sentence (Entropy): 
{output['masked_sentence']}") + # # print(f"Mask Logits (Entropy): {output['mask_logits']}") + # print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}') + # print(f' length of output["mask_logits"] : {len(output["mask_logits"])}') + # print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}') \ No newline at end of file diff --git a/utils/old/masking/masking_methods_new_work.py b/utils/old/masking/masking_methods_new_work.py new file mode 100644 index 0000000000000000000000000000000000000000..b72482bce55f9257f68134017abb2076dd2d24a1 --- /dev/null +++ b/utils/old/masking/masking_methods_new_work.py @@ -0,0 +1,447 @@ +import random +import torch +from transformers import BertTokenizer, BertForMaskedLM +from nltk.corpus import stopwords +import nltk + +# Ensure stopwords are downloaded +try: + nltk.data.find('corpora/stopwords') +except LookupError: + nltk.download('stopwords') + +class MaskingProcessor: + def __init__(self): + + self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + self.model = BertForMaskedLM.from_pretrained("bert-base-uncased") + self.stop_words = set(stopwords.words('english')) + + def remove_stopwords(self, words): + """ + Remove stopwords from the given list of words. + + Args: + words (list): List of words. + + Returns: + list: List of non-stop words. + """ + return [word for word in words if word.lower() not in self.stop_words] + + def adjust_ngram_indices(self, original_words, common_ngrams): + """ + Adjust indices of common n-grams after removing stopwords. + + Args: + original_words (list): Original list of words. + common_ngrams (dict): Common n-grams and their indices. + + Returns: + dict: Adjusted common n-grams with updated indices. + """ + non_stop_words = self.remove_stopwords(original_words) + original_to_non_stop = [] + non_stop_idx = 0 + + for original_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + original_to_non_stop.append((original_idx, non_stop_idx)) + non_stop_idx += 1 + + adjusted_ngrams = {} + for ngram, positions in common_ngrams.items(): + adjusted_positions = [] + for start, end in positions: + try: + new_start = next(non_stop for orig, non_stop in original_to_non_stop if orig == start) + new_end = next(non_stop for orig, non_stop in original_to_non_stop if orig == end) + adjusted_positions.append((new_start, new_end)) + except StopIteration: + continue # Skip if indices cannot be mapped + adjusted_ngrams[ngram] = adjusted_positions + + return adjusted_ngrams + + def mask_sentence_random(self, sentence, common_ngrams): + """ + Mask words in the sentence based on the specified rules after removing stopwords. 
+ """ + original_words = sentence.split() + print(f' ---- original_words : {original_words} ----- ') + non_stop_words = self.remove_stopwords(original_words) + print(f' ---- non_stop_words : {non_stop_words} ----- ') + adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams) + print(f' ---- common_ngrams : {common_ngrams} ----- ') + print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ') + + mask_indices = [] + + # Extract n-gram positions in non-stop words + ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions] + + # Mask a word before the first common n-gram + if ngram_positions: + print(f' ---- ngram_positions : {ngram_positions} ----- ') + first_ngram_start = ngram_positions[0][0] + print(f' ---- first_ngram_start : {first_ngram_start} ----- ') + if first_ngram_start > 0: + mask_index_before_ngram = random.randint(0, first_ngram_start-1) + print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ') + mask_indices.append(mask_index_before_ngram) + + # Mask words between common n-grams + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][1] + print(f' ---- end_prev : {end_prev} ----- ') # END INDICE FROM PREV LOOP FUNKNLKNLKNLKNLKNLKNLSKDNFLKSDHJFLSDJKFH:KLSDHF:LHKSDF:HJKLDFS:HJKLDFSHJK: + start_next = ngram_positions[i + 1][0] + print(f' ---- start_next : {start_next} ----- ') + if start_next > end_prev + 1: + mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1) + print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ') + mask_indices.append(mask_index_between_ngrams) + + # Mask a word after the last common n-gram + last_ngram_end = ngram_positions[-1][1] + if last_ngram_end < len(non_stop_words) - 1: + print(f' ---- last_ngram_end : {last_ngram_end} ----- ') + mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1) + print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ') + mask_indices.append(mask_index_after_ngram) + + # Create mapping from non-stop words to original indices + non_stop_to_original = {} + non_stop_idx = 0 + for orig_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + non_stop_to_original[non_stop_idx] = orig_idx + non_stop_idx += 1 + + # Map mask indices from non-stop word positions to original positions + print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ') + original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices] + print(f' ---- original_mask_indices : {original_mask_indices} ----- ') + + # Apply masks to the original sentence + masked_words = original_words.copy() + for idx in original_mask_indices: + masked_words[idx] = self.tokenizer.mask_token + + return " ".join(masked_words) + + def mask_sentence_pseudorandom(self, sentence, common_ngrams): + """ + Mask words in the sentence based on the specified rules after removing stopwords. 
+ """ + random.seed(42) + original_words = sentence.split() + print(f' ---- original_words : {original_words} ----- ') + non_stop_words = self.remove_stopwords(original_words) + print(f' ---- non_stop_words : {non_stop_words} ----- ') + adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams) + print(f' ---- common_ngrams : {common_ngrams} ----- ') + print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ') + + mask_indices = [] + + # Extract n-gram positions in non-stop words + ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions] + + # Mask a word before the first common n-gram + if ngram_positions: + print(f' ---- ngram_positions : {ngram_positions} ----- ') + first_ngram_start = ngram_positions[0][0] + print(f' ---- first_ngram_start : {first_ngram_start} ----- ') + if first_ngram_start > 0: + mask_index_before_ngram = random.randint(0, first_ngram_start-1) + print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ') + mask_indices.append(mask_index_before_ngram) + + # Mask words between common n-grams + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][1] + print(f' ---- end_prev : {end_prev} ----- ') + start_next = ngram_positions[i + 1][0] + print(f' ---- start_next : {start_next} ----- ') + if start_next > end_prev + 1: + mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1) + print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ') + mask_indices.append(mask_index_between_ngrams) + + # Mask a word after the last common n-gram + last_ngram_end = ngram_positions[-1][1] + if last_ngram_end < len(non_stop_words) - 1: + print(f' ---- last_ngram_end : {last_ngram_end} ----- ') + mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1) + print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ') + mask_indices.append(mask_index_after_ngram) + + # Create mapping from non-stop words to original indices + non_stop_to_original = {} + non_stop_idx = 0 + for orig_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + non_stop_to_original[non_stop_idx] = orig_idx + non_stop_idx += 1 + + # Map mask indices from non-stop word positions to original positions + print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ') + original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices] + print(f' ---- original_mask_indices : {original_mask_indices} ----- ') + + # Apply masks to the original sentence + masked_words = original_words.copy() + for idx in original_mask_indices: + masked_words[idx] = self.tokenizer.mask_token + + return " ".join(masked_words) + + + def calculate_word_entropy(self, sentence, word_position): + """ + Calculate entropy for a specific word position in the sentence. 
+ + Args: + sentence (str): The input sentence + word_position (int): Position of the word to calculate entropy for + + Returns: + float: Entropy value for the word + """ + words = sentence.split() + masked_words = words.copy() + masked_words[word_position] = self.tokenizer.mask_token + masked_sentence = " ".join(masked_words) + + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"] + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + + # Get probabilities for the masked position + probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1) + # Calculate entropy: -sum(p * log(p)) + entropy = -torch.sum(probs * torch.log(probs + 1e-9)) + + return entropy.item() + + def mask_sentence_entropy(self, sentence, common_ngrams): + """ + Mask words in the sentence based on entropy, following n-gram positioning rules. + + Args: + sentence (str): Original sentence + common_ngrams (dict): Common n-grams and their indices + + Returns: + str: Masked sentence + """ + original_words = sentence.split() + non_stop_words = self.remove_stopwords(original_words) + adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams) + + # Create mapping from non-stop words to original indices + non_stop_to_original = {} + original_to_non_stop = {} + non_stop_idx = 0 + for orig_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + non_stop_to_original[non_stop_idx] = orig_idx + original_to_non_stop[orig_idx] = non_stop_idx + non_stop_idx += 1 + + ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions] + mask_indices = [] + + if ngram_positions: + # Handle words before first n-gram + first_ngram_start = ngram_positions[0][0] + if first_ngram_start > 0: + # Calculate entropy for all candidate positions + candidate_positions = range(0, first_ngram_start) + entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) + for pos in candidate_positions] + # Select position with highest entropy + mask_indices.append(max(entropies, key=lambda x: x[1])[0]) + + # Handle words between n-grams + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][1] + start_next = ngram_positions[i + 1][0] + if start_next > end_prev + 1: + candidate_positions = range(end_prev + 1, start_next) + entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) + for pos in candidate_positions] + mask_indices.append(max(entropies, key=lambda x: x[1])[0]) + + # Handle words after last n-gram + last_ngram_end = ngram_positions[-1][1] + if last_ngram_end < len(non_stop_words) - 1: + candidate_positions = range(last_ngram_end + 1, len(non_stop_words)) + entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) + for pos in candidate_positions] + mask_indices.append(max(entropies, key=lambda x: x[1])[0]) + + # Map mask indices to original sentence positions and apply masks + original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices] + masked_words = original_words.copy() + for idx in original_mask_indices: + masked_words[idx] = self.tokenizer.mask_token + + return " ".join(masked_words) + + + def calculate_mask_logits(self, masked_sentence): + """ + Calculate logits for masked tokens in the sentence using BERT. + + Args: + masked_sentence (str): Sentence with [MASK] tokens. 
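+ + Note: the keys of the returned dict are positions of [MASK] tokens in BERT's tokenized input (not word positions in the sentence), and each value is the full vocabulary logit vector (30,522 values for bert-base-uncased).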
+ + Returns: + dict: Masked token indices and their logits. + """ + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"] + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + + mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index} + return mask_logits + + def process_sentences(self, sentences, result_dict, method="random"): + """ + Process sentences and calculate logits for masked tokens. + + Args: + sentences (list): List of sentences + result_dict (dict): Dictionary of common n-grams + method (str): Masking method ("random" or "entropy") + + Returns: + dict: Masked sentences and logits for each sentence + """ + results = {} + + for sentence, ngrams in result_dict.items(): + if method == "random": + masked_sentence = self.mask_sentence_random(sentence, ngrams) + elif method == "pseudorandom": + masked_sentence = self.mask_sentence_pseudorandom(sentence, ngrams) + else: # entropy + masked_sentence = self.mask_sentence_entropy(sentence, ngrams) + + logits = self.calculate_mask_logits(masked_sentence) + results[sentence] = { + "masked_sentence": masked_sentence, + "mask_logits": logits + } + + return results + + + +if __name__ == "__main__": + # !!! Working both the cases regardless if the stopword is removed or not + sentences = [ + "The quick brown fox jumps over the lazy dog everyday.", + # "A speedy brown fox jumps over a lazy dog.", + # "A swift brown fox leaps over the lethargic dog." + ] + result_dict ={ + 'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}, + # 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}, + # 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]} + } + + + processor = MaskingProcessor() + # results_random = processor.process_sentences(sentences, result_dict) + results_entropy = processor.process_sentences(sentences, result_dict, method="random") + + # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False) + + for sentence, output in results_entropy.items(): + print(f"Original Sentence (Random): {sentence}") + print(f"Masked Sentence (Random): {output['masked_sentence']}") + # print(f"Mask Logits (Random): {output['mask_logits']}") + print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}') + print(f' length of output["mask_logits"] : {len(output["mask_logits"])}') + print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}') + print('--------------------------------') + for mask_idx, logits in output["mask_logits"].items(): + print(f"Logits for [MASK] at position {mask_idx}:") + print(f' logits : {logits[:5]}') # List of logits for all vocabulary tokens + print(f' len(logits) : {len(logits)}') + + + + +# ------------------------------------------------------------------------------------------- + # def mask_sentence(self, sentence, common_ngrams): + # """ + # Mask words in the sentence based on the specified rules after removing stopwords. + + # Args: + # sentence (str): Original sentence. + # common_ngrams (dict): Common n-grams and their indices. + + # Returns: + # str: Masked sentence. 
+ # """ + # original_words = sentence.split() + # print(f' ---- original_words : {original_words} ----- ') + # non_stop_words = self.remove_stopwords(original_words) + # print(f' ---- non_stop_words : {non_stop_words} ----- ') + # adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams) + # print(f' ---- common_ngrams : {common_ngrams} ----- ') + # print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ') + + # mask_indices = [] + + # # Extract n-gram positions in non-stop words + # ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions] + # print(f' ---- ngram_positions : {ngram_positions} ----- ') + # # Mask a word before the first common n-gram + # if ngram_positions: + # first_ngram_start = ngram_positions[0][0] + # print(f' ---- first_ngram_start : {first_ngram_start} ----- ') + # if first_ngram_start > 0: + # mask_index_before_ngram = random.randint(0, first_ngram_start-1) + # print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ') + # mask_indices.append(mask_index_before_ngram) + + # # Mask words between common n-grams + # for i in range(len(ngram_positions) - 1): + # end_prev = ngram_positions[i][1] + # print(f' ---- end_prev : {end_prev} ----- ') + # start_next = ngram_positions[i + 1][0] + # print(f' ---- start_next : {start_next} ----- ') + # if start_next > end_prev + 1: + # mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1) + # print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ') + # mask_indices.append(mask_index_between_ngrams) + + # # Mask a word after the last common n-gram + # last_ngram_end = ngram_positions[-1][1] + # print(f' ---- last_ngram_end : {last_ngram_end} ----- ') + # if last_ngram_end < len(non_stop_words) - 1: + # mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1) + # print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ') + # mask_indices.append(mask_index_after_ngram) + + # # Map mask indices back to original sentence + # adjusted_indices = [ + # orig for orig, non_stop in enumerate(original_words) + # if non_stop in mask_indices + # ] + + # # Apply masks to the original sentence + # for idx in adjusted_indices: + # original_words[idx] = self.tokenizer.mask_token + + # return " ".join(original_words) diff --git a/utils/old/masking/masking_methods_ok_working.py b/utils/old/masking/masking_methods_ok_working.py new file mode 100644 index 0000000000000000000000000000000000000000..59fb09c9a16273344f9f949a13be894403c16ddc --- /dev/null +++ b/utils/old/masking/masking_methods_ok_working.py @@ -0,0 +1,257 @@ +import random +import torch +from transformers import BertTokenizer, BertForMaskedLM +from nltk.corpus import stopwords +import nltk + +# Ensure stopwords are downloaded +try: + nltk.data.find('corpora/stopwords') +except LookupError: + nltk.download('stopwords') + +class MaskingProcessor: + def __init__(self, ): + self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + self.model = BertForMaskedLM.from_pretrained("bert-base-uncased") + self.stop_words = set(stopwords.words('english')) + + def adjust_ngram_indices(self, words, common_ngrams, remove_stopwords): + """ + Adjust indices of common n-grams after removing stop words. + + Args: + words (list): List of words in the original sentence. + common_ngrams (dict): Common n-grams and their indices. + + Returns: + dict: Adjusted common n-grams and their indices. 
+ """ + if not remove_stopwords: + return common_ngrams + + non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words] + adjusted_ngrams = {} + + for ngram, positions in common_ngrams.items(): + adjusted_positions = [] + for start, end in positions: + try: + new_start = non_stop_word_indices.index(start) + new_end = non_stop_word_indices.index(end) + adjusted_positions.append((new_start, new_end)) + except ValueError: + continue # Skip if indices cannot be mapped + adjusted_ngrams[ngram] = adjusted_positions + + return adjusted_ngrams + + def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords): + """ + Mask one word before the first common n-gram, one between two n-grams, + and one after the last common n-gram (random selection). + + Args: + original_sentence (str): Original sentence + common_ngrams (dict): Common n-grams and their indices + + Returns: + str: Masked sentence with original stop words retained + """ + words = original_sentence.split() + non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words + adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords) + + mask_indices = [] + # Handle before the first common n-gram + if adjusted_ngrams: + first_ngram_start = list(adjusted_ngrams.values())[0][0][0] + if first_ngram_start > 0: + mask_indices.append(random.randint(0, first_ngram_start - 1)) + + # Handle between common n-grams + ngram_positions = list(adjusted_ngrams.values()) + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][-1][1] + start_next = ngram_positions[i + 1][0][0] + if start_next > end_prev + 1: + mask_indices.append(random.randint(end_prev + 1, start_next - 1)) + + # Handle after the last common n-gram + last_ngram_end = ngram_positions[-1][-1][1] + if last_ngram_end < len(non_stop_words) - 1: + mask_indices.append(random.randint(last_ngram_end + 1, len(non_stop_words) - 1)) + + # Mask the chosen indices + original_masked_sentence = words[:] + for idx in mask_indices: + if idx not in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]: + non_stop_words[idx] = self.tokenizer.mask_token + original_masked_sentence[idx] = self.tokenizer.mask_token + + return " ".join(original_masked_sentence) + + def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords): + """ + Mask one word before the first common n-gram, one between two n-grams, + and one after the last common n-gram (highest entropy selection). 
+ + Args: + original_sentence (str): Original sentence + common_ngrams (dict): Common n-grams and their indices + + Returns: + str: Masked sentence with original stop words retained + """ + words = original_sentence.split() + non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words + adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords) + entropy_scores = {} + + for idx, word in enumerate(non_stop_words): + if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]: + continue # Skip words in common n-grams + + masked_sentence = non_stop_words[:idx] + [self.tokenizer.mask_token] + non_stop_words[idx + 1:] + masked_sentence = " ".join(masked_sentence) + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"] + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + + filtered_logits = logits[0, mask_token_index, :] + probs = torch.softmax(filtered_logits, dim=-1) + entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item() # Add epsilon to prevent log(0) + entropy_scores[idx] = entropy + + mask_indices = [] + + # Handle before the first common n-gram + if adjusted_ngrams: + first_ngram_start = list(adjusted_ngrams.values())[0][0][0] + candidates = [i for i in range(first_ngram_start) if i in entropy_scores] + if candidates: + mask_indices.append(max(candidates, key=lambda x: entropy_scores[x])) + + # Handle between common n-grams + ngram_positions = list(adjusted_ngrams.values()) + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][-1][1] + start_next = ngram_positions[i + 1][0][0] + candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores] + if candidates: + mask_indices.append(max(candidates, key=lambda x: entropy_scores[x])) + + # Handle after the last common n-gram + last_ngram_end = ngram_positions[-1][-1][1] + candidates = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i in entropy_scores] + if candidates: + mask_indices.append(max(candidates, key=lambda x: entropy_scores[x])) + + # Mask the chosen indices + original_masked_sentence = words[:] + for idx in mask_indices: + non_stop_words[idx] = self.tokenizer.mask_token + original_masked_sentence[idx] = self.tokenizer.mask_token + + return " ".join(original_masked_sentence) + + def calculate_mask_logits(self, masked_sentence): + """ + Calculate logits for masked tokens in the sentence using BERT. + + Args: + masked_sentence (str): Sentence with [MASK] tokens + + Returns: + dict: Masked token indices and their logits + """ + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"] + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + + mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index} + return mask_logits + + def process_sentences(self, original_sentences, result_dict, method="random", remove_stopwords=False): + """ + Process a list of sentences and calculate logits for masked tokens using the specified method. 
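+ Each entry of the returned dict maps an original sentence to {"masked_sentence": <sentence with [MASK] tokens>, "mask_logits": <output of calculate_mask_logits for that masked sentence>}.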
+ + Args: + original_sentences (list): List of original sentences + result_dict (dict): Common n-grams and their indices for each sentence + method (str): Masking method ("random" or "entropy") + + Returns: + dict: Masked sentences and their logits for each sentence + """ + results = {} + + for sentence, ngrams in result_dict.items(): + if method == "random": + masked_sentence = self.mask_sentence_random(sentence, ngrams, remove_stopwords) + elif method == "entropy": + masked_sentence = self.mask_sentence_entropy(sentence, ngrams, remove_stopwords) + else: + raise ValueError("Invalid method. Choose 'random' or 'entropy'.") + + logits = self.calculate_mask_logits(masked_sentence) + results[sentence] = { + "masked_sentence": masked_sentence, + "mask_logits": logits + } + + return results + +# Example usage +if __name__ == "__main__": + # !!! Working both the cases regardless if the stopword is removed or not + sentences = [ + "The quick brown fox jumps over the lazy dog.", + "A quick brown dog outpaces a lazy fox.", + "Quick brown animals leap over lazy obstacles." + ] + + result_dict = { + "The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]}, + "A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]}, + "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]} + } + + # result_dict = { + # "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}, + # "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}, + # "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]} + # } + + processor = MaskingProcessor() + results_random = processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False) + # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False) + + for sentence, output in results_random.items(): + print(f"Original Sentence (Random): {sentence}") + print(f"Masked Sentence (Random): {output['masked_sentence']}") + # print(f"Mask Logits (Random): {output['mask_logits']}") + print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}') + print(f' length of output["mask_logits"] : {len(output["mask_logits"])}') + print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}') + print('--------------------------------') + for mask_idx, logits in output["mask_logits"].items(): + print(f"Logits for [MASK] at position {mask_idx}:") + print(f' logits : {logits[:5]}') # List of logits for all vocabulary tokens + + + + # print('--------------------------------') + # for sentence, output in results_entropy.items(): + # print(f"Original Sentence (Entropy): {sentence}") + # print(f"Masked Sentence (Entropy): {output['masked_sentence']}") + # # print(f"Mask Logits (Entropy): {output['mask_logits']}") + # print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}') + # print(f' length of output["mask_logits"] : {len(output["mask_logits"])}') + # print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}') \ No newline at end of file diff --git a/utils/old/masking/masking_methods_v1_working.py b/utils/old/masking/masking_methods_v1_working.py new file mode 100644 index 0000000000000000000000000000000000000000..12b4efc66e6188eb743db8fca94391f0ad212ec9 --- /dev/null +++ b/utils/old/masking/masking_methods_v1_working.py @@ -0,0 +1,233 @@ +import random +import torch +from transformers 
import BertTokenizer, BertForMaskedLM +from nltk.corpus import stopwords +import nltk + + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # THIS IS WORKING WHEN THE COORDINATES ARE WITHOUT REMOVING STOPWORDS + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + +# Ensure stopwords are downloaded +try: + nltk.data.find('corpora/stopwords') +except LookupError: + nltk.download('stopwords') + +class MaskingProcessor: + def __init__(self): + self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + self.model = BertForMaskedLM.from_pretrained("bert-base-uncased") + self.stop_words = set(stopwords.words('english')) + + def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords=False): + """ + Mask one word before the first common n-gram, one between two n-grams, + and one after the last common n-gram (random selection). + + Args: + original_sentence (str): Original sentence + common_ngrams (dict): Common n-grams and their indices + + Returns: + str: Masked sentence + """ + if remove_stopwords: + words = original_sentence.split() + words = [word for word in words if word not in self.stop_words] + else: + words = original_sentence.split() + + mask_indices = [] + # Handle before the first common n-gram + if common_ngrams: + first_ngram_start = list(common_ngrams.values())[0][0][0] + if first_ngram_start > 0: + mask_indices.append(random.randint(0, first_ngram_start - 1)) + + # Handle between common n-grams + ngram_positions = list(common_ngrams.values()) + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][-1][1] + start_next = ngram_positions[i + 1][0][0] + if start_next > end_prev + 1: + mask_indices.append(random.randint(end_prev + 1, start_next - 1)) + + # Handle after the last common n-gram + last_ngram_end = ngram_positions[-1][-1][1] + if last_ngram_end < len(words) - 1: + mask_indices.append(random.randint(last_ngram_end + 1, len(words) - 1)) + + # Mask the chosen indices + for idx in mask_indices: + if idx not in [index for ngram_indices in common_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]: + words[idx] = self.tokenizer.mask_token + + return " ".join(words) + + def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords=False): + """ + Mask one word before the first common n-gram, one between two n-grams, + and one after the last common n-gram (highest entropy selection). 
+ + Args: + original_sentence (str): Original sentence + common_ngrams (dict): Common n-grams and their indices + + Returns: + str: Masked sentence + """ + if remove_stopwords: + words = original_sentence.split() + words = [word for word in words if word not in self.stop_words] + else: + words = original_sentence.split() + entropy_scores = {} + + for idx, word in enumerate(words): + if idx in [index for ngram_indices in common_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]: + continue # Skip words in common n-grams + + masked_sentence = words[:idx] + [self.tokenizer.mask_token] + words[idx + 1:] + masked_sentence = " ".join(masked_sentence) + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"] + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + + filtered_logits = logits[0, mask_token_index, :] + probs = torch.softmax(filtered_logits, dim=-1) + entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item() # Add epsilon to prevent log(0) + entropy_scores[idx] = entropy + + mask_indices = [] + + # Handle before the first common n-gram + if common_ngrams: + first_ngram_start = list(common_ngrams.values())[0][0][0] + candidates = [i for i in range(first_ngram_start) if i in entropy_scores] + if candidates: + mask_indices.append(max(candidates, key=lambda x: entropy_scores[x])) + + # Handle between common n-grams + ngram_positions = list(common_ngrams.values()) + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][-1][1] + start_next = ngram_positions[i + 1][0][0] + candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores] + if candidates: + mask_indices.append(max(candidates, key=lambda x: entropy_scores[x])) + + # Handle after the last common n-gram + last_ngram_end = ngram_positions[-1][-1][1] + candidates = [i for i in range(last_ngram_end + 1, len(words)) if i in entropy_scores] + if candidates: + mask_indices.append(max(candidates, key=lambda x: entropy_scores[x])) + + # Mask the chosen indices + for idx in mask_indices: + words[idx] = self.tokenizer.mask_token + + return " ".join(words) + + def calculate_mask_logits(self, masked_sentence): + """ + Calculate logits for masked tokens in the sentence using BERT. + + Args: + masked_sentence (str): Sentence with [MASK] tokens + + Returns: + dict: Masked token indices and their logits + """ + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"] + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + + mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index} + return mask_logits + + def process_sentences(self, original_sentences, result_dict, remove_stopwords=False, method="random"): + """ + Process a list of sentences and calculate logits for masked tokens using the specified method. 
+ + Args: + original_sentences (list): List of original sentences + result_dict (dict): Common n-grams and their indices for each sentence + method (str): Masking method ("random" or "entropy") + + Returns: + dict: Masked sentences and their logits for each sentence + """ + results = {} + + for sentence, ngrams in result_dict.items(): + if method == "random": + masked_sentence = self.mask_sentence_random(sentence, ngrams) + elif method == "entropy": + masked_sentence = self.mask_sentence_entropy(sentence, ngrams) + else: + raise ValueError("Invalid method. Choose 'random' or 'entropy'.") + + logits = self.calculate_mask_logits(masked_sentence) + results[sentence] = { + "masked_sentence": masked_sentence, + "mask_logits": logits + } + + return results + +# Example usage +if __name__ == "__main__": + # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # THIS IS WORKING WHEN THE COORDINATES ARE WITHOUT REMOVING STOPWORDS + + sentences = [ + "The quick brown fox jumps over the lazy dog.", + "A quick brown dog outpaces a lazy fox.", + "Quick brown animals leap over lazy obstacles." + ] + + result_dict = { + "The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]}, + "A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]}, + "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]} + } + + # result_dict = { + # "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}, + # "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}, + # "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]} + # } + + processor = MaskingProcessor() + results_random = processor.process_sentences(sentences, result_dict, remove_stopwords=True, method="random") + results_entropy = processor.process_sentences(sentences, result_dict, remove_stopwords=True, method="entropy") + + for sentence, output in results_random.items(): + print(f"Original Sentence (Random): {sentence}") + print(f"Masked Sentence (Random): {output['masked_sentence']}") + # print(f"Mask Logits (Random): {output['mask_logits']}") + + for sentence, output in results_entropy.items(): + print(f"Original Sentence (Entropy): {sentence}") + print(f"Masked Sentence (Entropy): {output['masked_sentence']}") + # print(f"Mask Logits (Entropy): {output['mask_logits']}") + + + + +''' + result_dict = { + "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}, + "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}, + "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]} + } + +''' \ No newline at end of file diff --git a/utils/old/masking_methods_final_copy.py b/utils/old/masking_methods_final_copy.py new file mode 100644 index 0000000000000000000000000000000000000000..e54f32c6de9c0c33da175c60beb376d632afaf6f --- /dev/null +++ b/utils/old/masking_methods_final_copy.py @@ -0,0 +1,619 @@ +import random +import torch +from transformers import BertTokenizer, BertForMaskedLM +from nltk.corpus import stopwords +import nltk +from transformers import RobertaTokenizer, RobertaForMaskedLM + + +# Ensure stopwords are downloaded +try: + nltk.data.find('corpora/stopwords') +except LookupError: + nltk.download('stopwords') + +class MaskingProcessor: + # def __init__(self, tokenizer, model): + def __init__(self): + # self.tokenizer = 
BertTokenizer.from_pretrained("bert-base-uncased") + # self.model = BertForMaskedLM.from_pretrained("bert-base-uncased") + + # self.tokenizer = tokenizer + # self.model = model + + self.tokenizer = BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking") + self.model = BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking") + + # self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base") + # self.model = RobertaForMaskedLM.from_pretrained("roberta-base") + + self.stop_words = set(stopwords.words('english')) + + def remove_stopwords(self, words): + """ + Remove stopwords from the given list of words. + + Args: + words (list): List of words. + + Returns: + list: List of non-stop words. + """ + return [word for word in words if word.lower() not in self.stop_words] + + def adjust_ngram_indices(self, original_words, common_ngrams): + """ + Adjust indices of common n-grams after removing stopwords. + + Args: + original_words (list): Original list of words. + common_ngrams (dict): Common n-grams and their indices. + + Returns: + dict: Adjusted common n-grams with updated indices. + """ + non_stop_words = self.remove_stopwords(original_words) + original_to_non_stop = [] + non_stop_idx = 0 + + for original_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + original_to_non_stop.append((original_idx, non_stop_idx)) + non_stop_idx += 1 + + adjusted_ngrams = {} + for ngram, positions in common_ngrams.items(): + adjusted_positions = [] + for start, end in positions: + try: + new_start = next(non_stop for orig, non_stop in original_to_non_stop if orig == start) + new_end = next(non_stop for orig, non_stop in original_to_non_stop if orig == end) + adjusted_positions.append((new_start, new_end)) + except StopIteration: + continue # Skip if indices cannot be mapped + adjusted_ngrams[ngram] = adjusted_positions + + return adjusted_ngrams + + def mask_sentence_random(self, sentence, common_ngrams): + """ + Mask words in the sentence based on the specified rules after removing stopwords. + """ + # Split sentence into words + original_words = sentence.split() + + # Handle punctuation at the end + has_punctuation = False + punctuation = None + if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']): + has_punctuation = True + punctuation = original_words[-1][-1] + original_words = original_words[:-1] + + print(f' ---- original_words : {original_words} ----- ') + + # Process words without punctuation + non_stop_words = self.remove_stopwords(original_words) + adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams) + + # Rest of the existing function code... 
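+ # The three blocks below pick one mask position before the first common n-gram, one in each gap between consecutive n-grams, and one after the last n-gram; positions are chosen among non-stop words and mapped back to original word indices before the [MASK] token is inserted.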
+ mask_indices = [] + ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions] + + if ngram_positions: + first_ngram_start = ngram_positions[0][0] + if first_ngram_start > 0: + mask_index_before_ngram = random.randint(0, first_ngram_start-1) + mask_indices.append(mask_index_before_ngram) + + # Mask words between common n-grams + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][1] + start_next = ngram_positions[i + 1][0] + if start_next > end_prev + 1: + mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1) + mask_indices.append(mask_index_between_ngrams) + + # Mask a word after the last common n-gram + last_ngram_end = ngram_positions[-1][1] + if last_ngram_end < len(non_stop_words) - 1: + mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1) + mask_indices.append(mask_index_after_ngram) + + # Create mapping from non-stop words to original indices + non_stop_to_original = {} + non_stop_idx = 0 + for orig_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + non_stop_to_original[non_stop_idx] = orig_idx + non_stop_idx += 1 + + # Map mask indices and apply masks + original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices] + masked_words = original_words.copy() + for idx in original_mask_indices: + masked_words[idx] = self.tokenizer.mask_token + # masked_words[idx] = '' # for roberta + + # Add back punctuation if it existed + if has_punctuation: + masked_words.append(punctuation) + + print(f' ***** masked_words at end : {masked_words} ***** ') + print(f' ***** original_mask_indices : {original_mask_indices} ***** ') + print(f' ***** TESTING : {" ".join(masked_words)} ***** ') + + return " ".join(masked_words), original_mask_indices + + def mask_sentence_pseudorandom(self, sentence, common_ngrams): + """ + Mask words in the sentence based on the specified rules after removing stopwords. + """ + # Split sentence into words + random.seed(3) + original_words = sentence.split() + + # Handle punctuation at the end + has_punctuation = False + punctuation = None + if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']): + has_punctuation = True + punctuation = original_words[-1][-1] + original_words = original_words[:-1] + + print(f' ---- original_words : {original_words} ----- ') + + # Process words without punctuation + non_stop_words = self.remove_stopwords(original_words) + adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams) + + # Rest of the existing function code... 
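+ # Same selection logic as mask_sentence_random, but random.seed(3) above makes the chosen positions reproducible across runs.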
+ mask_indices = [] + ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions] + + if ngram_positions: + first_ngram_start = ngram_positions[0][0] + if first_ngram_start > 0: + mask_index_before_ngram = random.randint(0, first_ngram_start-1) + mask_indices.append(mask_index_before_ngram) + + # Mask words between common n-grams + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][1] + start_next = ngram_positions[i + 1][0] + if start_next > end_prev + 1: + mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1) + mask_indices.append(mask_index_between_ngrams) + + # Mask a word after the last common n-gram + last_ngram_end = ngram_positions[-1][1] + if last_ngram_end < len(non_stop_words) - 1: + mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1) + mask_indices.append(mask_index_after_ngram) + + # Create mapping from non-stop words to original indices + non_stop_to_original = {} + non_stop_idx = 0 + for orig_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + non_stop_to_original[non_stop_idx] = orig_idx + non_stop_idx += 1 + + # Map mask indices and apply masks + original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices] + masked_words = original_words.copy() + for idx in original_mask_indices: + masked_words[idx] = self.tokenizer.mask_token + # masked_words[idx] = '' # for roberta + + # Add back punctuation if it existed + if has_punctuation: + masked_words.append(punctuation) + + print(f' ***** masked_words at end : {masked_words} ***** ') + print(f' ***** original_mask_indices : {original_mask_indices} ***** ') + print(f' ***** TESTING : {" ".join(masked_words)} ***** ') + + return " ".join(masked_words), original_mask_indices + + + def calculate_word_entropy(self, sentence, word_position): + """ + Calculate entropy for a specific word position in the sentence. + + Args: + sentence (str): The input sentence + word_position (int): Position of the word to calculate entropy for + + Returns: + float: Entropy value for the word + """ + words = sentence.split() + masked_words = words.copy() + masked_words[word_position] = self.tokenizer.mask_token + masked_sentence = " ".join(masked_words) + + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"] + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + + # Get probabilities for the masked position + probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1) + # Calculate entropy: -sum(p * log(p)) + entropy = -torch.sum(probs * torch.log(probs + 1e-9)) + + return entropy.item() + + def mask_sentence_entropy(self, sentence, common_ngrams): + """ + Mask words in the sentence based on entropy, following n-gram positioning rules. 
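+ Trailing punctuation is split off before processing and re-attached at the end; within each region allowed by the common n-gram positions, the word whose masked BERT distribution has the highest entropy is selected for masking.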
+ + Args: + sentence (str): Original sentence + common_ngrams (dict): Common n-grams and their indices + + Returns: + str: Masked sentence + """ + # Split sentence into words + original_words = sentence.split() + + # Handle punctuation at the end + has_punctuation = False + punctuation = None + if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']): + has_punctuation = True + punctuation = original_words[-1][-1] + original_words = original_words[:-1] + + # Process words without punctuation + non_stop_words = self.remove_stopwords(original_words) + adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams) + + # Create mapping from non-stop words to original indices + non_stop_to_original = {} + original_to_non_stop = {} + non_stop_idx = 0 + for orig_idx, word in enumerate(original_words): + if word.lower() not in self.stop_words: + non_stop_to_original[non_stop_idx] = orig_idx + original_to_non_stop[orig_idx] = non_stop_idx + non_stop_idx += 1 + + ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions] + mask_indices = [] + + if ngram_positions: + # Handle words before first n-gram + first_ngram_start = ngram_positions[0][0] + if first_ngram_start > 0: + candidate_positions = range(0, first_ngram_start) + entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) + for pos in candidate_positions] + mask_indices.append(max(entropies, key=lambda x: x[1])[0]) + + # Handle words between n-grams + for i in range(len(ngram_positions) - 1): + end_prev = ngram_positions[i][1] + start_next = ngram_positions[i + 1][0] + if start_next > end_prev + 1: + candidate_positions = range(end_prev + 1, start_next) + entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) + for pos in candidate_positions] + mask_indices.append(max(entropies, key=lambda x: x[1])[0]) + + # Handle words after last n-gram + last_ngram_end = ngram_positions[-1][1] + if last_ngram_end < len(non_stop_words) - 1: + candidate_positions = range(last_ngram_end + 1, len(non_stop_words)) + entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) + for pos in candidate_positions] + mask_indices.append(max(entropies, key=lambda x: x[1])[0]) + + # Map mask indices to original sentence positions and apply masks + original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices] + masked_words = original_words.copy() + for idx in original_mask_indices: + masked_words[idx] = self.tokenizer.mask_token + + # Add back punctuation if it existed + if has_punctuation: + masked_words.append(punctuation) + + return " ".join(masked_words), original_mask_indices + + def calculate_mask_logits(self, original_sentence, original_mask_indices): + """ + Calculate logits for masked tokens in the sentence using BERT. 
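+ For each masked position the top 100 vocabulary candidates are retrieved, subword pieces (tokens starting with '##') and duplicate words are filtered out, and up to 50 unique whole-word candidates are kept together with their logits.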
+ + Args: + original_sentence (str): Original sentence without masks + original_mask_indices (list): List of indices to mask + + Returns: + dict: Masked token indices and their logits + """ + print('==========================================================================================================') + words = original_sentence.split() + print(f' ##### calculate_mask_logits >> words : {words} ##### ') + mask_logits = {} + + for idx in original_mask_indices: + # Create a copy of words and mask the current position + print(f' ---- idx : {idx} ----- ') + masked_words = words.copy() + masked_words[idx] = '[MASK]' + # masked_words[idx] = '' # for roberta + masked_sentence = " ".join(masked_words) + print(f' ---- masked_sentence : {masked_sentence} ----- ') + + # Calculate logits for the current mask + input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"] + mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + + # Extract logits for the masked position + mask_logits_tensor = logits[0, mask_token_index, :] + + # Get top logits and corresponding tokens + top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 100, dim=-1) # Get more candidates + + # Convert token IDs to words and filter out subword tokens + top_tokens = [] + top_logits = [] + seen_words = set() # To keep track of unique words + + for token_id, logit in zip(top_mask_indices[0], top_mask_logits[0]): + token = self.tokenizer.convert_ids_to_tokens(token_id.item()) + + # Skip if it's a subword token (starts with ##) + if token.startswith('##'): + continue + + # Convert token to proper word + word = self.tokenizer.convert_tokens_to_string([token]).strip() + + # Only add if it's a new word and not empty + if word and word not in seen_words: + seen_words.add(word) + top_tokens.append(word) + top_logits.append(logit.item()) + + # Break if we have 50 unique complete words + if len(top_tokens) == 50: + break + + # print(f' ---- top_tokens : {top_tokens} ----- ') + + # Store results + mask_logits[idx] = { + "tokens": top_tokens, + "logits": top_logits + } + + return mask_logits + + # def calculate_mask_logits(self, original_sentence, original_mask_indices): + # """ + # Calculate logits for masked tokens in the sentence using BERT. 
+ + # Args: + # original_sentence (str): Original sentence without masks + # original_mask_indices (list): List of indices to mask + + # Returns: + # dict: Masked token indices and their logits + # """ + # words = original_sentence.split() + # print(f' ##### calculate_mask_logits >> words : {words} ##### ') + # mask_logits = {} + + # for idx in original_mask_indices: + # # Create a copy of words and mask the current position + # print(f' ---- idx : {idx} ----- ') + # masked_words = words.copy() + # print(f' ---- words : {masked_words} ----- ') + # # masked_words[idx] = self.tokenizer.mask_token + # masked_words[idx] = '[MASK]' + # print(f' ---- masked_words : {masked_words} ----- ') + # masked_sentence = " ".join(masked_words) + # print(f' ---- masked_sentence : {masked_sentence} ----- ') + + # # Calculate logits for the current mask + # input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"] + # mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + # with torch.no_grad(): + # outputs = self.model(input_ids) + # logits = outputs.logits + + # # Extract logits for the masked position + # mask_logits_tensor = logits[0, mask_token_index, :] + + # # Get top 50 logits and corresponding tokens + # top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 50, dim=-1) + + # # Convert token IDs to words + # top_tokens = [self.tokenizer.convert_ids_to_tokens(token_id.item()) for token_id in top_mask_indices[0]] + # print(f' ---- top_tokens : {top_tokens} ----- ') + + # # Store results + # mask_logits[idx] = { + # "tokens": top_tokens, + # "logits": top_mask_logits.tolist() + # } + + # return mask_logits + + + def process_sentences(self, sentences, result_dict, method="random"): + """ + Process sentences and calculate logits for masked tokens. + """ + results = {} + + for sentence, ngrams in result_dict.items(): + # Split punctuation from the last word before processing + words = sentence.split() + last_word = words[-1] + if any(last_word.endswith(p) for p in ['.', ',', '!', '?', ';', ':']): + # Split the last word and punctuation + words[-1] = last_word[:-1] + punctuation = last_word[-1] + # Rejoin with space before punctuation to treat it as separate token + processed_sentence = " ".join(words) + " " + punctuation + else: + processed_sentence = sentence + + if method == "random": + masked_sentence, original_mask_indices = self.mask_sentence_random(processed_sentence, ngrams) + elif method == "pseudorandom": + masked_sentence, original_mask_indices = self.mask_sentence_pseudorandom(processed_sentence, ngrams) + else: # entropy + masked_sentence, original_mask_indices = self.mask_sentence_entropy(processed_sentence, ngrams) + + logits = self.calculate_mask_logits(processed_sentence, original_mask_indices) + results[sentence] = { + "masked_sentence": masked_sentence, + "mask_logits": logits + } + + return results + + + +if __name__ == "__main__": + # !!! Working both the cases regardless if the stopword is removed or not + sentences = [ + "The quick brown fox jumps over small cat the lazy dog everyday again and again .", + # "A speedy brown fox jumps over a lazy dog.", + # "A swift brown fox leaps over the lethargic dog." 
+ + ] + result_dict ={ + 'The quick brown fox jumps over small cat the lazy dog everyday again and again .': {'brown fox': [(2, 3)],'cat': [(7, 7)], 'dog': [(10, 10)]}, + # 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}, + # 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]} + } + + + processor = MaskingProcessor() + # results_random = processor.process_sentences(sentences, result_dict) + results_entropy = processor.process_sentences(sentences, result_dict, method="random") + + ''' + results structure : + results = { + "The quick brown fox jumps over the lazy dog everyday.": + { # Original sentence as key + "masked_sentence": str, # The sentence with [MASK] tokens + "mask_logits": + { # Dictionary of mask positions and their predictions + 1: + { # Position of mask in sentence + "tokens" (words) : list, # List of top 50 predicted tokens + "logits" (probabilities) : list # Corresponding logits for those tokens + }, + 7: + { + "tokens" (words) : list, + "logits" (probabilities) : list + }, + 10: + { + "tokens (words)": list, + "logits (probabilities)": list + } + } + } + } + + ''' + # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False) + + for sentence, output in results_entropy.items(): + print(f"Original Sentence (Random): {sentence}") + print(f"Masked Sentence (Random): {output['masked_sentence']}") + # print(f"Mask Logits (Random): {output['mask_logits']}") + # print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}') + # print(f' length of output["mask_logits"] : {len(output["mask_logits"])}') + # print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}') + # print('--------------------------------') + # for mask_idx, logits in output["mask_logits"].items(): + # print(f"Logits for [MASK] at position {mask_idx}:") + # print(f' logits : {logits[:5]}') # List of logits for all vocabulary tokens + # print(f' len(logits) : {len(logits)}') + + +# ------------------------------------------------------------------------------------------------ + # def mask_sentence_random(self, sentence, common_ngrams): + # """ + # Mask words in the sentence based on the specified rules after removing stopwords. 
+ # """ + # original_words = sentence.split() + # # print(f' ---- original_words : {original_words} ----- ') + # non_stop_words = self.remove_stopwords(original_words) + # # print(f' ---- non_stop_words : {non_stop_words} ----- ') + # adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams) + # # print(f' ---- common_ngrams : {common_ngrams} ----- ') + # # print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ') + + # mask_indices = [] + + # # Extract n-gram positions in non-stop words + # ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions] + + # # Mask a word before the first common n-gram + # if ngram_positions: + # # print(f' ---- ngram_positions : {ngram_positions} ----- ') + # first_ngram_start = ngram_positions[0][0] + # # print(f' ---- first_ngram_start : {first_ngram_start} ----- ') + # if first_ngram_start > 0: + # mask_index_before_ngram = random.randint(0, first_ngram_start-1) + # # print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ') + # mask_indices.append(mask_index_before_ngram) + + # # Mask words between common n-grams + # for i in range(len(ngram_positions) - 1): + # end_prev = ngram_positions[i][1] + # # print(f' ---- end_prev : {end_prev} ----- ') + # start_next = ngram_positions[i + 1][0] + # # print(f' ---- start_next : {start_next} ----- ') + # if start_next > end_prev + 1: + # mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1) + # # print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ') + # mask_indices.append(mask_index_between_ngrams) + + # # Mask a word after the last common n-gram + # last_ngram_end = ngram_positions[-1][1] + # if last_ngram_end < len(non_stop_words) - 1: + # # print(f' ---- last_ngram_end : {last_ngram_end} ----- ') + # mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1) + # # print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ') + # mask_indices.append(mask_index_after_ngram) + + # # Create mapping from non-stop words to original indices + # non_stop_to_original = {} + # non_stop_idx = 0 + # for orig_idx, word in enumerate(original_words): + # if word.lower() not in self.stop_words: + # non_stop_to_original[non_stop_idx] = orig_idx + # non_stop_idx += 1 + + # # Map mask indices from non-stop word positions to original positions + # # print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ') + # original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices] + # # print(f' ---- original_mask_indices : {original_mask_indices} ----- ') + + # # Apply masks to the original sentence + # masked_words = original_words.copy() + # for idx in original_mask_indices: + # masked_words[idx] = self.tokenizer.mask_token + + # return " ".join(masked_words), original_mask_indices diff --git a/utils/old/non_melting_points_v1.py b/utils/old/non_melting_points_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..2a6369ecfc79e18cba487f54f59f8e00cdaf58be --- /dev/null +++ b/utils/old/non_melting_points_v1.py @@ -0,0 +1,244 @@ +import nltk +from nltk.corpus import stopwords +from nltk.util import ngrams +from collections import Counter +import re + +class NgramProcessor: + def __init__(self): + try: + nltk.data.find('corpora/stopwords') + except LookupError: + nltk.download('stopwords') + + self.stop_words = set(stopwords.words('english')) + + def remove_stopwords(self, text): + """ + Remove stopwords using NLTK's stopword list + + Args: + text (str): 
Input text + + Returns: + str: Cleaned text with stopwords removed + """ + words = re.findall(r'\w+', text.lower()) + filtered_words = [word for word in words if word not in self.stop_words] + return ' '.join(filtered_words) + + def is_exact_match(self, ngram, sentences): + """ + Check if the given n-gram has an exact match in all sentences + + Args: + ngram (str): The n-gram to search for + sentences (list): List of sentences to search in + + Returns: + bool: True if n-gram has exact match in all sentences, False otherwise + """ + return all(ngram in sentence for sentence in sentences) + + def is_substring_of_any(self, ngram, common_ngrams): + """ + Check if the given n-gram is an exact substring of any previously found common n-grams + + Args: + ngram (str): The n-gram to check + common_ngrams (list): List of previously found common n-grams + + Returns: + bool: True if ngram is a substring of any common_ngrams, False otherwise + """ + return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram) + + def find_filtered_ngrams(self, sentences): + """ + Find all n-grams that have exact matches across all sentences, + excluding those that are part of larger common n-grams + + Args: + sentences (list): List of sentences to analyze + + Returns: + list: List of tuples where each tuple contains the n-gram and its indices in each sentence + """ + original_sentences = sentences[:] + sentences = [self.remove_stopwords(sentence) for sentence in sentences] + ngram_lengths = [4, 3, 2, 1] # Quadgram, trigram, bigram, unigram + common_ngrams = [] + + for n in ngram_lengths: + ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences] + ngrams_counter = Counter(ngrams_list[0]) + + for ngram in ngrams_counter: + ngram_str = ' '.join(ngram) + if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, [ng[0] for ng in common_ngrams]): + indices = [] + for original_sentence in original_sentences: + words = original_sentence.split() + ngram_indices = [ + (i, i + n - 1) for i in range(len(words) - n + 1) + if ' '.join(words[i:i + n]).lower() == ngram_str + ] + indices.append(ngram_indices) + common_ngrams.append((ngram_str, indices)) + + return common_ngrams + + def find_relative_order(self, sentence, common_ngrams): + """ + Find the relative order of the common n-grams in the sentence + + Args: + sentence (str): Sentence in which to find the relative order + common_ngrams (list): List of common n-grams + + Returns: + list: List of tuples with the relative position and the n-gram + """ + relative_order = [] + for ngram, _ in common_ngrams: + index = sentence.find(ngram) + if index != -1: + relative_order.append((index, ngram)) + + return sorted(relative_order) + +# Example usage +if __name__ == "__main__": + sentences = [ + "The quick brown fox jumps over the lazy dog.", + "A quick brown dog outpaces a lazy fox.", + "Quick brown animals leap over lazy obstacles." 
+ ] + + processor = NgramProcessor() + common_ngrams = processor.find_filtered_ngrams(sentences) + print("Common n-grams and their indices:") + for ngram, indices in common_ngrams: + print(f"{ngram}: {indices}") + + for sentence in sentences: + relative_order = processor.find_relative_order(sentence, common_ngrams) + print(f"Relative order in sentence '{sentence}':", relative_order) + + + +# import nltk +# from nltk.corpus import stopwords +# from nltk.util import ngrams +# from collections import Counter +# import re + +# class NgramProcessor: +# def __init__(self): +# try: +# nltk.data.find('corpora/stopwords') +# except LookupError: +# nltk.download('stopwords') + +# self.stop_words = set(stopwords.words('english')) + +# def remove_stopwords(self, text): +# """ +# Remove stopwords using NLTK's stopword list + +# Args: +# text (str): Input text + +# Returns: +# str: Cleaned text with stopwords removed +# """ +# words = re.findall(r'\w+', text.lower()) +# filtered_words = [word for word in words if word not in self.stop_words] +# return ' '.join(filtered_words) + +# def is_exact_match(self, ngram, sentences): +# """ +# Check if the given n-gram has an exact match in all sentences + +# Args: +# ngram (str): The n-gram to search for +# sentences (list): List of sentences to search in + +# Returns: +# bool: True if n-gram has exact match in all sentences, False otherwise +# """ +# return all(ngram in sentence for sentence in sentences) + +# def is_substring_of_any(self, ngram, common_ngrams): +# """ +# Check if the given n-gram is an exact substring of any previously found common n-grams + +# Args: +# ngram (str): The n-gram to check +# common_ngrams (list): List of previously found common n-grams + +# Returns: +# bool: True if ngram is a substring of any common_ngrams, False otherwise +# """ +# return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram) + +# def find_filtered_ngrams(self, sentences): +# """ +# Find all n-grams that have exact matches across all sentences, +# excluding those that are part of larger common n-grams + +# Args: +# sentences (list): List of sentences to analyze + +# Returns: +# list: List of all common n-grams in order of their appearance in the first sentence +# """ +# sentences = [self.remove_stopwords(sentence) for sentence in sentences] +# ngram_lengths = [4, 3, 2, 1] # Quadgram, trigram, bigram, unigram +# common_ngrams = [] + +# for n in ngram_lengths: +# ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences] +# ngrams_counter = Counter(ngrams_list[0]) + +# for ngram in ngrams_counter: +# ngram_str = ' '.join(ngram) +# if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, common_ngrams): +# common_ngrams.append(ngram_str) + +# return common_ngrams + +# def find_relative_order(self, sentence, common_ngrams): +# """ +# Find the relative order of the common n-grams in the sentence + +# Args: +# sentence (str): Sentence in which to find the relative order +# common_ngrams (list): List of common n-grams + +# Returns: +# list: List of tuples with the relative position and the n-gram +# """ +# relative_order = [] +# for ngram in common_ngrams: +# index = sentence.find(ngram) +# if index != -1: +# relative_order.append((index, ngram)) + +# return sorted(relative_order) + +# # Example usage +# if __name__ == "__main__": +# sentences = [ +# "The quick brown fox jumps over the lazy dog.", +# "A quick brown dog outpaces a lazy fox.", +# "Quick brown animals leap over lazy obstacles." 
+# ] + +# processor = NgramProcessor() +# common_ngrams = processor.find_filtered_ngrams(sentences) +# print("Common n-grams:", common_ngrams) + +# for sentence in sentences: +# relative_order = processor.find_relative_order(sentence, common_ngrams) +# print(f"Relative order in sentence '{sentence}':", relative_order) diff --git a/utils/old/sampling/sampling.py b/utils/old/sampling/sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..5c0a58912beeb4d15d0c982d790d22423594c359 --- /dev/null +++ b/utils/old/sampling/sampling.py @@ -0,0 +1,330 @@ +import torch +import random +from masking_methods import MaskingProcessor +import nltk +from nltk.corpus import words +import torch.nn.functional as F + + +class SamplingProcessor: + def __init__(self, tokenizer): + """ + Initialize the SamplingProcessor. + + Args: + tokenizer: BERT tokenizer instance + """ + self.tokenizer = tokenizer + self.subtoken_prefix = self._get_subtoken_prefix() + self.subtoken_ids = self._get_subtoken_ids() + try: + nltk.data.find('corpora/words') + except LookupError: + nltk.download('words') + self.english_words = set(words.words()) + + # def _get_subtoken_prefix(self): + # """ + # Identify the subtoken prefix based on the tokenizer. + + # Returns: + # str: The prefix used for subtokens (e.g., "##" for BERT). + # """ + # # This method assumes that the tokenizer uses a consistent subtoken prefix. + # # Adjust accordingly if using different tokenizers. + # # For BERT's WordPiece tokenizer: + # if hasattr(self.tokenizer, "init_kwargs") and "wordpiece_prefix" in self.tokenizer.init_kwargs: + # return self.tokenizer.init_kwargs["wordpiece_prefix"] + # elif hasattr(self.tokenizer, "prefix_tokens"): + # return self.tokenizer.prefix_tokens + # else: + # # Default to BERT's subtoken prefix + # return "##" + + def _get_subtoken_prefix(self): + """ + Identify the subtoken prefix based on the tokenizer. + + Returns: + str: The prefix used for subtokens (e.g., "##" for BERT). + """ + # This method assumes that the tokenizer uses a consistent subtoken prefix. + # Adjust accordingly if using different tokenizers. + # For BERT's WordPiece tokenizer: + if hasattr(self.tokenizer, "init_kwargs") and "wordpiece_prefix" in self.tokenizer.init_kwargs: + return self.tokenizer.init_kwargs["wordpiece_prefix"] + elif hasattr(self.tokenizer, "prefix_tokens"): + return self.tokenizer.prefix_tokens + else: + # Default to BERT's subtoken prefix + return "##" + + + # def _get_subtoken_ids(self): + # """ + # Retrieve all token IDs that correspond to subtokens. + + # Returns: + # set: A set of subtoken IDs. + # """ + # vocab = self.tokenizer.get_vocab() + # subtoken_ids = set() + # for token, idx in vocab.items(): + # if token.startswith(self.subtoken_prefix): + # subtoken_ids.add(idx) + # return subtoken_ids + + def _get_subtoken_ids(self): + """ + Retrieve all token IDs that correspond to subtokens. + + Returns: + list: A list of subtoken IDs. 
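For reference, the non-melting-point idea in non_melting_points_v1.py boils down to: strip stopwords, then keep only those word n-grams that occur verbatim in every candidate sentence, longest first, so shorter n-grams already contained in a kept phrase are skipped. A minimal sketch of that core loop, assuming NLTK is installed and the stopword corpus has been downloaded; punctuation handling here is simplified for illustration.

from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams

STOP = set(stopwords.words("english"))

def common_phrases(sentences, max_n=4):
    """Return phrases (up to max_n words, stopwords removed) shared by all sentences."""
    cleaned = [" ".join(w.strip(".,") for w in s.lower().split() if w.strip(".,") not in STOP)
               for s in sentences]
    shared = []
    for n in range(max_n, 0, -1):                      # longest n-grams first
        for gram in Counter(ngrams(cleaned[0].split(), n)):
            phrase = " ".join(gram)
            in_all = all(phrase in c for c in cleaned)
            subsumed = any(phrase in kept for kept in shared)
            if in_all and not subsumed:
                shared.append(phrase)
    return shared

# common_phrases(["The quick brown fox jumps over the lazy dog.",
#                 "A quick brown dog outpaces a lazy fox."])
# -> ['quick brown', ...]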
+ """ + vocab = self.tokenizer.get_vocab() + subtoken_ids = [] + for token, idx in vocab.items(): + if token.startswith(self.subtoken_prefix): + subtoken_ids.append(idx) + return subtoken_ids # Changed from set to list + + + def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0): + tokens = self.tokenizer.tokenize(masked_sentence) + + for mask_pos in sorted(mask_logits_dict.keys()): + try: + # Get logits and squeeze extra dimension + mask_logits = torch.tensor(mask_logits_dict[mask_pos]).squeeze(0) # Remove the extra dimension + + # Create a mask for valid tokens (no special tokens, no subwords) + valid_mask = torch.zeros_like(mask_logits, dtype=torch.bool) + for idx in range(len(mask_logits)): + token = self.tokenizer.convert_ids_to_tokens([idx])[0] + # Only allow regular words (no special tokens, no subwords) + if token.isalpha() and not token.startswith('[') and not token.startswith('##'): + valid_mask[idx] = True + + # Get valid logits + valid_logits = mask_logits[valid_mask] + valid_indices = torch.where(valid_mask)[0] + + if len(valid_logits) == 0: + print(f"Warning: No valid tokens found for position {mask_pos}") + continue + + if sampling_technique == "inverse_transform": + probs = torch.softmax(valid_logits / temperature, dim=-1) + cumulative_probs = torch.cumsum(probs, dim=-1) + random_prob = random.random() + sampled_idx = torch.where(cumulative_probs >= random_prob)[0][0].item() + sampled_index = valid_indices[sampled_idx].item() + + elif sampling_technique == "exponential_minimum": + probs = torch.softmax(valid_logits / temperature, dim=-1) + exp_probs = torch.exp(-torch.log(probs)) + random_probs = torch.rand_like(exp_probs) + sampled_idx = torch.argmax(random_probs * exp_probs).item() + sampled_index = valid_indices[sampled_idx].item() + + elif sampling_technique == "temperature": + valid_logits = torch.clamp(valid_logits, min=-1e8, max=1e8) + probs = torch.softmax(valid_logits / temperature, dim=-1) + if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)): + raise ValueError("The computed probabilities contain NaN or inf values.") + probs = torch.max(probs, torch.tensor(1e-8)) + probs = probs / torch.sum(probs) + sampled_idx = torch.multinomial(probs, 1)[0].item() + sampled_index = valid_indices[sampled_idx].item() + + elif sampling_technique == 'greedy': + sampled_idx = torch.argmax(valid_logits).item() + sampled_index = valid_indices[sampled_idx].item() + + # Replace mask with sampled token + sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0] + tokens[mask_pos] = sampled_token + + except Exception as e: + print(f"Error sampling for position {mask_pos}: {str(e)}") + continue + + return self.tokenizer.convert_tokens_to_string(tokens) + + + + def process_masked_sentences(self, results_dict, sampling_technique="temperature", temperature=1.0): + """ + Process all masked sentences in the results dictionary. 
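The per-position loop in sample_tokens above rebuilds the set of "valid" whole-word tokens for every mask, which is a full vocabulary scan per position. One way to pay that cost once is to precompute a boolean mask over the vocabulary and reuse it; a minimal sketch, with the checkpoint name only as an example:

import torch
from transformers import BertTokenizer

def whole_word_mask(tokenizer):
    """True for plain alphabetic vocab entries; False for special tokens
    ([CLS], [MASK], ...) and '##' WordPiece continuations."""
    mask = torch.zeros(len(tokenizer), dtype=torch.bool)
    for token, idx in tokenizer.get_vocab().items():
        if token.isalpha() and not token.startswith("##"):
            mask[idx] = True
    return mask

tok = BertTokenizer.from_pretrained("bert-base-uncased")
VALID = whole_word_mask(tok)                  # reuse for every mask position
# valid_logits  = mask_logits[VALID]
# valid_indices = torch.where(VALID)[0]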
+ + Args: + results_dict (dict): Dictionary containing masked sentences and their logits + sampling_technique (str): Sampling method to use + temperature (float): Temperature parameter for sampling + + Returns: + dict: Dictionary containing original, masked, and sampled sentences + """ + processed_results = {} + + for original_sentence, data in results_dict.items(): + masked_sentence = data["masked_sentence"] + mask_logits = data["mask_logits"] + + sampled_sentence = self.sample_tokens( + mask_logits, + masked_sentence, + sampling_technique, + temperature + ) + + processed_results[original_sentence] = { + "masked_sentence": masked_sentence, + "sampled_sentence": sampled_sentence + } + + return processed_results + +if __name__ == "__main__": + sentences = [ + "The quick brown fox jumps over the lazy dog everyday.", + ] + result_dict = { + 'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}, + } + + # First, mask the sentences + masking_processor = MaskingProcessor() + masking_results = masking_processor.process_sentences(sentences, result_dict) + + # Then, sample replacements for the masks + sampling_processor = SamplingProcessor(masking_processor.tokenizer) + + # Try different sampling techniques + sampling_techniques = ["temperature", "greedy", "inverse_transform", "exponential_minimum"] + + for technique in sampling_techniques: + print(f"\nSampling using {technique}:") + sampled_results = sampling_processor.process_masked_sentences( + masking_results, + sampling_technique=technique, + temperature=1.0 + ) + + for original_sentence, result in sampled_results.items(): + print(f"Original: {original_sentence}") + print(f"Masked: {result['masked_sentence']}") + print(f"Sampled: {result['sampled_sentence']}") + print("---") + +# -------------------------------------------------------------------------------------------------- + # def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0, top_k=100): + # words = masked_sentence.split() + # mask_positions = sorted(mask_logits_dict.keys()) + + # for mask_pos in mask_positions: + # mask_logits = torch.tensor(mask_logits_dict[mask_pos]) + + # try: + # if sampling_technique == "inverse_transform": + # probs = torch.softmax(mask_logits / temperature, dim=-1) + # cumulative_probs = torch.cumsum(probs, dim=-1) + # random_prob = random.random() + # sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item() + + # elif sampling_technique == "exponential_minimum": + # probs = torch.softmax(mask_logits / temperature, dim=-1) + # exp_probs = torch.exp(-torch.log(probs)) + # random_probs = torch.rand_like(exp_probs) + # sampled_index = torch.argmax(random_probs * exp_probs).item() + + # elif sampling_technique == "temperature": + # mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8) + # probs = torch.softmax(mask_logits / temperature, dim=-1) + # if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)): + # raise ValueError("The computed probabilities contain NaN or inf values.") + # probs = torch.max(probs, torch.tensor(1e-8)) + # probs = probs / torch.sum(probs) + # sampled_index = torch.multinomial(probs, 1)[0].item() + + # elif sampling_technique == 'greedy': + # sampled_index = torch.argmax(mask_logits).item() + + # else: + # raise ValueError(f"Unknown sampling technique: {sampling_technique}") + + # # Replace mask with sampled token + # sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0] + # words[mask_pos] = 
sampled_token + + # except Exception as e: + # print(f"Error sampling for position {mask_pos}: {str(e)}") + # continue + + # return " ".join(words) + + ## MORE WEIRD RESULTS + # def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0, top_k=100): + # words = masked_sentence.split() + # mask_positions = sorted(mask_logits_dict.keys()) + + # for mask_pos in mask_positions: + # mask_logits = torch.tensor(mask_logits_dict[mask_pos]) + + # try: + # # Create a mask for valid tokens (no special tokens, no subwords) + # valid_mask = torch.zeros_like(mask_logits, dtype=torch.bool) + # for idx in range(len(mask_logits)): + # token = self.tokenizer.convert_ids_to_tokens([idx])[0] + # # Only allow regular words (no special tokens, no subwords) + # if token.isalpha() and not token.startswith('[') and not token.startswith('##'): + # valid_mask[idx] = True + + # # Get valid logits + # valid_logits = mask_logits[valid_mask] + # valid_indices = torch.where(valid_mask)[0] + + # if len(valid_logits) == 0: + # print(f"Warning: No valid tokens found for position {mask_pos}") + # continue + + # if sampling_technique == "inverse_transform": + # probs = torch.softmax(valid_logits / temperature, dim=-1) + # cumulative_probs = torch.cumsum(probs, dim=-1) + # random_prob = random.random() + # sampled_idx = torch.where(cumulative_probs >= random_prob)[0][0].item() + # sampled_index = valid_indices[sampled_idx].item() + + # elif sampling_technique == "exponential_minimum": + # probs = torch.softmax(valid_logits / temperature, dim=-1) + # exp_probs = torch.exp(-torch.log(probs)) + # random_probs = torch.rand_like(exp_probs) + # sampled_idx = torch.argmax(random_probs * exp_probs).item() + # sampled_index = valid_indices[sampled_idx].item() + + # elif sampling_technique == "temperature": + # valid_logits = torch.clamp(valid_logits, min=-1e8, max=1e8) + # probs = torch.softmax(valid_logits / temperature, dim=-1) + # if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)): + # raise ValueError("The computed probabilities contain NaN or inf values.") + # probs = torch.max(probs, torch.tensor(1e-8)) + # probs = probs / torch.sum(probs) + # sampled_idx = torch.multinomial(probs, 1)[0].item() + # sampled_index = valid_indices[sampled_idx].item() + + # elif sampling_technique == 'greedy': + # sampled_idx = torch.argmax(valid_logits).item() + # sampled_index = valid_indices[sampled_idx].item() + + # else: + # raise ValueError(f"Unknown sampling technique: {sampling_technique}") + + # # Replace mask with sampled token + # sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0] + # words[mask_pos] = sampled_token + + # except Exception as e: + # print(f"Error sampling for position {mask_pos}: {str(e)}") + # continue + + # return " ".join(words) \ No newline at end of file diff --git a/utils/old/sampling/sampling_methods.py b/utils/old/sampling/sampling_methods.py new file mode 100644 index 0000000000000000000000000000000000000000..0e1e2c8cd22a8edfa8116e1274c7dfbdc3c64bc3 --- /dev/null +++ b/utils/old/sampling/sampling_methods.py @@ -0,0 +1,291 @@ +from transformers import BertTokenizer, BertForMaskedLM +import torch +import random +from masking_methods import MaskingProcessor +from transformers import pipeline + +class SamplingProcessorWithModel: + def __init__(self, model_name='bert-base-uncased'): + self.tokenizer = BertTokenizer.from_pretrained(model_name) + self.model = BertForMaskedLM.from_pretrained(model_name) + self.model.eval() # Set the 
model to evaluation mode + + def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0): + """ + Fills each mask in the masked sentence using the specified sampling technique. + + Args: + masked_sentence (str): Sentence with [MASK] tokens. + sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy"). + temperature (float): Temperature parameter for sampling methods. + + Returns: + str: Sentence with the masks filled. + """ + input_ids = self.tokenizer.encode(masked_sentence, return_tensors="pt") + + while self.tokenizer.mask_token_id in input_ids[0]: + # Find indices of all [MASK] tokens + mask_indices = torch.where(input_ids == self.tokenizer.mask_token_id)[1] + + # Process the first [MASK] token in the sequence + mask_index = mask_indices[0].item() + + # Get logits from the model + with torch.no_grad(): + outputs = self.model(input_ids) + logits = outputs.logits + + # Extract logits for the [MASK] token + mask_logits = logits[0, mask_index] + + if sampling_technique == "inverse_transform": + probs = torch.softmax(mask_logits / temperature, dim=-1) + cumulative_probs = torch.cumsum(probs, dim=-1) + random_prob = random.random() + sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item() + + elif sampling_technique == "exponential_minimum": + probs = torch.softmax(mask_logits / temperature, dim=-1) + exp_probs = torch.exp(-torch.log(probs)) + random_probs = torch.rand_like(exp_probs) + sampled_index = torch.argmax(random_probs * exp_probs).item() + + elif sampling_technique == "temperature": + mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8) + probs = torch.softmax(mask_logits / temperature, dim=-1) + if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)): + raise ValueError("The computed probabilities contain NaN or inf values.") + probs = torch.max(probs, torch.tensor(1e-8, device=mask_logits.device)) + probs = probs / torch.sum(probs) + probs = probs.flatten() + if probs.size(0) > 1: + sampled_index = torch.multinomial(probs, 1).item() + else: + sampled_index = torch.argmax(probs).item() + + elif sampling_technique == 'greedy': + sampled_index = torch.argmax(mask_logits).item() + + else: + raise ValueError(f"Unknown sampling technique: {sampling_technique}") + + # Replace the first [MASK] with the selected token + input_ids[0, mask_index] = sampled_index + + return self.tokenizer.decode(input_ids[0], skip_special_tokens=True) + + def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0): + """ + Fills each mask in the masked sentence using the specified sampling technique. + + Args: + masked_sentence (str): Sentence with [MASK] tokens. + sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy"). + temperature (float): Temperature parameter for sampling methods. + + Returns: + str: Sentence with the masks filled. 
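The model-based filler above resolves one [MASK] per forward pass, left to right, re-encoding after each replacement. Stripped to greedy decoding only, the same loop fits in a few lines; a condensed, runnable sketch assuming the Hugging Face transformers package:

import torch
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased").eval()

def fill_masks_greedy(sentence: str) -> str:
    """Replace [MASK] tokens one at a time with the argmax prediction."""
    input_ids = tokenizer.encode(sentence, return_tensors="pt")
    while tokenizer.mask_token_id in input_ids[0]:
        pos = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item()
        with torch.no_grad():
            logits = model(input_ids).logits          # [1, seq_len, vocab]
        input_ids[0, pos] = int(logits[0, pos].argmax())
    return tokenizer.decode(input_ids[0], skip_special_tokens=True)

# fill_masks_greedy("The quick [MASK] fox jumps over the [MASK] dog.")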
+ """ + while '[MASK]' in masked_sentence: + # Get predictions for the first [MASK] + predictions = self.unmasker(masked_sentence) + + # Ensure predictions is a list of dictionaries + if not isinstance(predictions, list) or not all(isinstance(pred, dict) for pred in predictions): + raise ValueError("Unexpected structure in predictions from the pipeline.") + + # Extract logits (scores) from the predictions + logits = torch.tensor([pred['score'] for pred in predictions], dtype=torch.float32) + + if sampling_technique == "inverse_transform": + probs = torch.softmax(logits / temperature, dim=-1) + cumulative_probs = torch.cumsum(probs, dim=-1) + random_prob = random.random() + sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item() + + elif sampling_technique == "exponential_minimum": + probs = torch.softmax(logits / temperature, dim=-1) + exp_probs = torch.exp(-torch.log(probs)) + random_probs = torch.rand_like(exp_probs) + sampled_index = torch.argmax(random_probs * exp_probs).item() + + elif sampling_technique == "temperature": + logits = torch.clamp(logits, min=-1e8, max=1e8) + probs = torch.softmax(logits / temperature, dim=-1) + if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)): + raise ValueError("The computed probabilities contain NaN or inf values.") + probs = torch.max(probs, torch.tensor(1e-8, device=logits.device)) + probs = probs / torch.sum(probs) + probs = probs.flatten() + if probs.size(0) > 1: + sampled_index = torch.multinomial(probs, 1).item() + else: + sampled_index = torch.argmax(probs).item() + + elif sampling_technique == 'greedy': + sampled_index = torch.argmax(logits).item() + + else: + raise ValueError(f"Unknown sampling technique: {sampling_technique}") + + # Replace the first [MASK] with the selected word + sampled_token = predictions[sampled_index]['token_str'] + masked_sentence = masked_sentence.replace('[MASK]', sampled_token, 1) + + return masked_sentence + + +# Example usage +if __name__ == "__main__": + from transformers import BertTokenizer + + # Define sentences and result_dict + sentences = [ + "The quick brown fox jumps over the lazy dog.", + "A quick brown dog outpaces a lazy fox.", + "Quick brown dog leaps over lazy the fox." 
+ ] + result_dict = { + "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]}, + "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}, + "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]} + } + + masking_processor = MaskingProcessor() + masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False) + + # Use SamplingProcessor + sampling_processor = SamplingProcessorWithModel() + + # Iterate through masking results to apply sampling + for sentence, result in masking_results.items(): + print(f"Original Sentence (Random): {sentence}") + print(f"Masked Sentence (Random): {result['masked_sentence']}") + masked_sentence = result["masked_sentence"] + + # Apply different sampling techniques + for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]: + print(f"Sampling Technique: {technique}") + filled_sentence = sampling_processor.fill_masked_sentence( + masked_sentence=masked_sentence, + sampling_technique=technique, + temperature=1.0 # Adjust temperature as needed + ) + print(f"Filled Sentence: {filled_sentence}\n") + print('--------------------------------') + + + +# from transformers import pipeline +# import torch +# import random +# from masking_methods import MaskingProcessor + + +# class SamplingProcessorWithPipeline: +# def __init__(self, model_name='bert-base-uncased'): +# self.unmasker = pipeline('fill-mask', model=model_name) +# self.tokenizer = self.unmasker.tokenizer + +# def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0): +# """ +# Fills each mask in the masked sentence using the specified sampling technique. + +# Args: +# masked_sentence (str): Sentence with [MASK] tokens. +# sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy"). +# temperature (float): Temperature parameter for sampling methods. + +# Returns: +# str: Sentence with the masks filled. 
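Both the second fill_masked_sentence above, which expects a self.unmasker attribute that SamplingProcessorWithModel never defines, and the commented-out pipeline class here sample over the scores that fill-mask returns for its top candidates. One detail worth handling: with more than one [MASK] in the input the pipeline returns a list of prediction lists, one per mask, so the output has to be unwrapped before indexing. A minimal sketch of that pattern:

import torch
from transformers import pipeline

unmasker = pipeline("fill-mask", model="bert-base-uncased")

def fill_first_mask(sentence: str, temperature: float = 1.0) -> str:
    """Fill the first [MASK] by sampling among the pipeline's top candidates."""
    preds = unmasker(sentence)
    if preds and isinstance(preds[0], list):   # several masks -> one list per mask
        preds = preds[0]
    scores = torch.tensor([p["score"] for p in preds])
    probs = torch.softmax(torch.log(scores) / temperature, dim=-1)
    choice = torch.multinomial(probs, 1).item()
    return sentence.replace("[MASK]", preds[choice]["token_str"], 1)

# while "[MASK]" in text:
#     text = fill_first_mask(text)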
+# """ +# while '[MASK]' in masked_sentence: +# # Get predictions for the first [MASK] +# predictions = self.unmasker(masked_sentence) +# print(f' predictions : {predictions}') +# print(f' type of predictions : {type(predictions)}') + +# # Ensure predictions is a list of dictionaries for the first [MASK] +# if not isinstance(predictions, list) or not all(isinstance(pred, dict) for pred in predictions): +# raise ValueError("Unexpected structure in predictions from the pipeline.") + +# # Extract logits (scores) from the predictions +# logits = torch.tensor([pred['score'] for pred in predictions], dtype=torch.float32) + +# if sampling_technique == "inverse_transform": +# probs = torch.softmax(logits / temperature, dim=-1) +# cumulative_probs = torch.cumsum(probs, dim=-1) +# random_prob = random.random() +# sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item() + +# elif sampling_technique == "exponential_minimum": +# probs = torch.softmax(logits / temperature, dim=-1) +# exp_probs = torch.exp(-torch.log(probs)) +# random_probs = torch.rand_like(exp_probs) +# sampled_index = torch.argmax(random_probs * exp_probs).item() + +# elif sampling_technique == "temperature": +# logits = torch.clamp(logits, min=-1e8, max=1e8) +# probs = torch.softmax(logits / temperature, dim=-1) +# if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)): +# raise ValueError("The computed probabilities contain NaN or inf values.") +# probs = torch.max(probs, torch.tensor(1e-8, device=logits.device)) +# probs = probs / torch.sum(probs) +# probs = probs.flatten() +# if probs.size(0) > 1: +# sampled_index = torch.multinomial(probs, 1).item() +# else: +# sampled_index = torch.argmax(probs).item() + +# elif sampling_technique == 'greedy': +# sampled_index = torch.argmax(logits).item() + +# else: +# raise ValueError(f"Unknown sampling technique: {sampling_technique}") + +# # Replace the first [MASK] with the selected word +# sampled_token = predictions[sampled_index]['token_str'] +# masked_sentence = masked_sentence.replace('[MASK]', sampled_token, 1) + +# return masked_sentence + + +# # Example usage +# if __name__ == "__main__": +# from transformers import BertTokenizer + +# # Define sentences and result_dict +# sentences = [ +# "The quick brown fox jumps over the lazy dog.", +# "A quick brown dog outpaces a lazy fox.", +# "Quick brown animals leap over lazy obstacles." 
+# ] +# result_dict = { +# "The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]}, +# "A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]}, +# "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]} +# } + +# masking_processor = MaskingProcessor() +# masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False) + +# # Use SamplingProcessor +# sampling_processor = SamplingProcessorWithPipeline() + +# # Iterate through masking results to apply sampling +# for sentence, result in masking_results.items(): +# print(f"Original Sentence (Random): {sentence}") +# print(f"Masked Sentence (Random): {result['masked_sentence']}") +# masked_sentence = result["masked_sentence"] + +# # Apply different sampling techniques +# for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]: +# print(f"Sampling Technique: {technique}") +# filled_sentence = sampling_processor.fill_masked_sentence( +# masked_sentence=masked_sentence, +# sampling_technique=technique, +# temperature=1.0 # Adjust temperature as needed +# ) +# print(f"Filled Sentence: {filled_sentence}\n") +# print('--------------------------------') diff --git a/utils/old/sampling/sampling_methods_v1.py b/utils/old/sampling/sampling_methods_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..e4b907c6c54dac0ac293d58e5c234d3fb7f34fc4 --- /dev/null +++ b/utils/old/sampling/sampling_methods_v1.py @@ -0,0 +1,146 @@ +import torch +import random +from masking_methods import MaskingProcessor + +class SamplingProcessor: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def fill_masked_sentence(self, original_sentence, mask_logits, sampling_technique, temperature=1.0): + """ + Fills each mask in the masked sentence using the specified sampling technique. + + Args: + original_sentence (str): The original masked sentence. + mask_logits (dict): Logits for each [MASK] token. + sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy"). + temperature (float): Temperature parameter for sampling methods. + + Returns: + str: Sentence with the masks filled. 
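The "inverse_transform" branch that recurs in these samplers is ordinary inverse-CDF sampling: softmax the (temperature-scaled) logits, build the cumulative distribution, and take the first index whose cumulative mass exceeds a uniform draw. A self-contained sketch:

import random
import torch

def inverse_transform_sample(logits: torch.Tensor, temperature: float = 1.0) -> int:
    """Inverse-CDF sampling from softmax(logits / temperature)."""
    probs = torch.softmax(logits / temperature, dim=-1)
    cdf = torch.cumsum(probs, dim=-1)
    u = random.random()                                  # uniform in [0, 1)
    idx = int(torch.searchsorted(cdf, torch.tensor(u)))
    return min(idx, logits.numel() - 1)                  # guard against float round-off

# inverse_transform_sample(torch.randn(30522), temperature=0.8)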
+ """ + sentence_tokens = self.tokenizer.tokenize(original_sentence) + mask_token_indices = [i for i, token in enumerate(sentence_tokens) if token == self.tokenizer.mask_token] + + if len(mask_token_indices) != len(mask_logits): + raise ValueError("Mismatch between number of [MASK] tokens and logits provided.") + + for mask_idx, filtered_logits in zip(mask_token_indices, mask_logits.values()): + # Convert logits to a tensor + filtered_logits = torch.tensor(filtered_logits) + # filtered_logits, _ = torch.sort(filtered_logits, descending=True) + # print(f' type of filtered_logits : {type(filtered_logits)}') + # filtered_logits = filtered_logits[:5] + + if sampling_technique == "inverse_transform": + probs = torch.softmax(filtered_logits / temperature, dim=-1) + cumulative_probs = torch.cumsum(probs, dim=-1) + random_prob = random.random() + sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item() + + elif sampling_technique == "exponential_minimum": + probs = torch.softmax(filtered_logits / temperature, dim=-1) + exp_probs = torch.exp(-torch.log(probs)) + random_probs = torch.rand_like(exp_probs) + sampled_index = torch.argmax(random_probs * exp_probs).item() + + elif sampling_technique == "temperature": + filtered_logits = torch.clamp(filtered_logits, min=-1e8, max=1e8) + probs = torch.softmax(filtered_logits / temperature, dim=-1) + if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)): + raise ValueError("The computed probabilities contain NaN or inf values.") + probs = torch.max(probs, torch.tensor(1e-8, device=filtered_logits.device)) + probs = probs / torch.sum(probs) + probs = probs.flatten() + if probs.size(0) > 1: + sampled_index = torch.multinomial(probs, 1).item() + else: + sampled_index = torch.argmax(probs).item() + + elif sampling_technique == 'greedy': + sampled_index = torch.argmax(filtered_logits).item() + + else: + raise ValueError(f"Unknown sampling technique: {sampling_technique}") + + sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0] + sentence_tokens[mask_idx] = sampled_token + + return self.tokenizer.convert_tokens_to_string(sentence_tokens) + + + + def process_samples(self, masked_sentences, mask_logits, sampling_technique, temperature=1.0): + """ + Process multiple masked sentences and fill their masks using the specified sampling technique. + + Args: + masked_sentences (list): List of masked sentences. + mask_logits (dict): Logits for each [MASK] token in each sentence. + sampling_technique (str): Sampling technique to use. + temperature (float): Temperature parameter for sampling methods. + + Returns: + list: List of sentences with masks filled. + """ + filled_sentences = [] + for sentence, logits in zip(masked_sentences, mask_logits): + filled_sentence = self.fill_masked_sentence(sentence, logits, sampling_technique, temperature) + filled_sentences.append(filled_sentence) + return filled_sentences + +# Example usage +if __name__ == "__main__": + from transformers import BertTokenizer + + # tokenizer = BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking") + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + processor = SamplingProcessor(tokenizer) + + sentences = [ + "The quick brown fox jumps over the lazy dog.", + "A quick brown dog outpaces a lazy fox.", + "Quick brown dog leaps over lazy the fox." 
+ ] + result_dict = { + "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]}, + "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}, + "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]} + } + + + masking_processor = MaskingProcessor() + masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False) + # masked_sentence = "The [MASK] brown fox jumps [MASK] the lazy dog." + # mask_logits = { + # 1: torch.randn(len(tokenizer)), # Example logits for first [MASK] + # 5: torch.randn(len(tokenizer)), # Example logits for second [MASK] + # } + + # Iterate through masking results to apply sampling + for sentence, result in masking_results.items(): + print(f"Original Sentence (Random): {sentence}") + print(f"Masked Sentence (Random): {result['masked_sentence']}") + # print(f"Mask Logits (Random): {output['mask_logits']}") + print(f' type(result["mask_logits"]) : {type(result["mask_logits"])}') + print(f' length of result["mask_logits"] : {len(result["mask_logits"])}') + print(f' result["mask_logits"].keys() : {result["mask_logits"].keys()}') + masked_sentence = result["masked_sentence"] + mask_logits = result["mask_logits"] + + print(f"Original Masked Sentence: {masked_sentence}") + + # Apply different sampling techniques + for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]: + print(f"Sampling Technique: {technique}") + + # Fill the masks using the sampling processor + filled_sentence = processor.fill_masked_sentence( + original_sentence=masked_sentence, + mask_logits=mask_logits, + sampling_technique=technique, + temperature=1.0 # Adjust temperature as needed + ) + + print(f"Filled Sentence: {filled_sentence}\n") + print('--------------------------------') \ No newline at end of file diff --git a/utils/old/sampling/sampling_methods_v2.py b/utils/old/sampling/sampling_methods_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..ff8400ff361b3fa5ba500385c12a62128d744b09 --- /dev/null +++ b/utils/old/sampling/sampling_methods_v2.py @@ -0,0 +1,112 @@ +from transformers import pipeline +import torch +import random +from masking_methods import MaskingProcessor + + +class SamplingProcessorWithPipeline: + def __init__(self, model_name='bert-base-uncased'): + self.unmasker = pipeline('fill-mask', model=model_name) + self.tokenizer = self.unmasker.tokenizer + + def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0): + """ + Fills each mask in the masked sentence using the specified sampling technique. + + Args: + masked_sentence (str): Sentence with [MASK] tokens. + sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy"). + temperature (float): Temperature parameter for sampling methods. + + Returns: + str: Sentence with the masks filled. 
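The "exponential_minimum" branch used throughout computes argmax(u / p), which is not the standard exponential-races construction. The textbook version draws Exp(1) noise, divides by the probabilities, and takes the argmin, which samples exactly from Categorical(p) and is equivalent to the Gumbel-max trick on the scaled logits. For comparison, a sketch of that form:

import torch

def exponential_minimum_sample(logits: torch.Tensor, temperature: float = 1.0) -> int:
    """Exponential races: argmin_i Exp(1)_i / p_i is distributed as Categorical(p)."""
    probs = torch.softmax(logits / temperature, dim=-1)
    exp_noise = -torch.log(torch.rand_like(probs))       # Exp(1) samples
    return int(torch.argmin(exp_noise / probs))

# Gumbel-max equivalent:
# int(torch.argmax(logits / temperature - torch.log(-torch.log(torch.rand_like(logits)))))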
+ """ + while '[MASK]' in masked_sentence: + # Get predictions for the first [MASK] + predictions = self.unmasker(masked_sentence) + print(f' predictions : {predictions}') + print(f' type of predictions : {type(predictions)}') + + # Ensure predictions is a list of dictionaries + if not isinstance(predictions, list) or not all(isinstance(pred, dict) for pred in predictions): + raise ValueError("Unexpected structure in predictions from the pipeline.") + + # Extract logits (scores) from the predictions + logits = torch.tensor([pred['score'] for pred in predictions], dtype=torch.float32) + + if sampling_technique == "inverse_transform": + probs = torch.softmax(logits / temperature, dim=-1) + cumulative_probs = torch.cumsum(probs, dim=-1) + random_prob = random.random() + sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item() + + elif sampling_technique == "exponential_minimum": + probs = torch.softmax(logits / temperature, dim=-1) + exp_probs = torch.exp(-torch.log(probs)) + random_probs = torch.rand_like(exp_probs) + sampled_index = torch.argmax(random_probs * exp_probs).item() + + elif sampling_technique == "temperature": + logits = torch.clamp(logits, min=-1e8, max=1e8) + probs = torch.softmax(logits / temperature, dim=-1) + if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)): + raise ValueError("The computed probabilities contain NaN or inf values.") + probs = torch.max(probs, torch.tensor(1e-8, device=logits.device)) + probs = probs / torch.sum(probs) + probs = probs.flatten() + if probs.size(0) > 1: + sampled_index = torch.multinomial(probs, 1).item() + else: + sampled_index = torch.argmax(probs).item() + + elif sampling_technique == 'greedy': + sampled_index = torch.argmax(logits).item() + + else: + raise ValueError(f"Unknown sampling technique: {sampling_technique}") + + # Replace the first [MASK] with the selected word + sampled_token = predictions[sampled_index]['token_str'] + masked_sentence = masked_sentence.replace('[MASK]', sampled_token, 1) + + return masked_sentence + + +# Example usage +if __name__ == "__main__": + from transformers import BertTokenizer + + # Define sentences and result_dict + sentences = [ + "The quick brown fox jumps over the lazy dog.", + "A quick brown dog outpaces a lazy fox.", + "Quick brown dog leaps over lazy the fox." 
+ ] + result_dict = { + "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]}, + "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}, + "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]} + } + + masking_processor = MaskingProcessor() + masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False) + + # Use SamplingProcessor + sampling_processor = SamplingProcessorWithPipeline() + + # Iterate through masking results to apply sampling + for sentence, result in masking_results.items(): + print(f"Original Sentence (Random): {sentence}") + print(f"Masked Sentence (Random): {result['masked_sentence']}") + masked_sentence = result["masked_sentence"] + + # Apply different sampling techniques + for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]: + print(f"Sampling Technique: {technique}") + filled_sentence = sampling_processor.fill_masked_sentence( + masked_sentence=masked_sentence, + sampling_technique=technique, + temperature=1.0 # Adjust temperature as needed + ) + print(f"Filled Sentence: {filled_sentence}\n") + print('--------------------------------') diff --git a/utils/old/sampling_final_copy.py b/utils/old/sampling_final_copy.py new file mode 100644 index 0000000000000000000000000000000000000000..d970bbc48e4aff0dccf12a27ba3673fc84555053 --- /dev/null +++ b/utils/old/sampling_final_copy.py @@ -0,0 +1,168 @@ +import torch +import random +from masking_methods import MaskingProcessor + +class SamplingProcessor: + def __init__(self, tokenizer): + """ + Initialize the SamplingProcessor. + + Args: + tokenizer: BERT tokenizer instance + """ + self.tokenizer = tokenizer + + def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0): + """ + Sample tokens for each mask in the sentence using the specified sampling technique. 
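Unlike the earlier versions, sampling_final_copy.py (and utils/sampling.py later in this diff) no longer carries full-vocabulary logits: each mask position arrives with a short list of candidate tokens and their scores, keyed by word position. A compact sketch of temperature sampling over that structure; the dict shape is inferred from the code that follows.

import torch

def sample_candidates(mask_logits_dict, words, temperature=1.0):
    """Fill each masked word position from its precomputed candidate list."""
    for pos in sorted(mask_logits_dict):
        entry = mask_logits_dict[pos]
        logits = torch.tensor(entry["logits"], dtype=torch.float)
        probs = torch.softmax(logits / temperature, dim=-1)
        pick = torch.multinomial(probs, 1).item()
        words[pos] = entry["tokens"][pick].replace("##", "")
    return " ".join(words)

# words = "The [MASK] fox".split()
# sample_candidates({1: {"tokens": ["quick", "swift"], "logits": [2.1, 1.3]}}, words)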
+ + Args: + mask_logits_dict (dict): Dictionary of mask positions and their logits/tokens + masked_sentence (str): Sentence with [MASK] tokens + sampling_technique (str): Sampling method to use + temperature (float): Temperature parameter for sampling + + Returns: + str: Sentence with sampled tokens replacing masks + """ + words = masked_sentence.split() + + # Convert positions and logits to sorted list to process masks in order + mask_positions = sorted(mask_logits_dict.keys()) + + for mask_pos in mask_positions: + mask_data = mask_logits_dict[mask_pos] + mask_logits = torch.tensor(mask_data['logits']) + candidate_tokens = mask_data['tokens'] + + try: + if sampling_technique == "inverse_transform": + probs = torch.softmax(mask_logits / temperature, dim=-1) + cumulative_probs = torch.cumsum(probs, dim=-1) + random_prob = random.random() + sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item() + + elif sampling_technique == "exponential_minimum": + probs = torch.softmax(mask_logits / temperature, dim=-1) + exp_probs = torch.exp(-torch.log(probs)) + random_probs = torch.rand_like(exp_probs) + sampled_index = torch.argmax(random_probs * exp_probs).item() + + elif sampling_technique == "temperature": + mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8) + probs = torch.softmax(mask_logits / temperature, dim=-1) + if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)): + raise ValueError("The computed probabilities contain NaN or inf values.") + probs = torch.max(probs, torch.tensor(1e-8)) + probs = probs / torch.sum(probs) + probs = probs.flatten() + if probs.size(0) > 1: + sampled_index = torch.multinomial(probs, 1).item() + else: + sampled_index = torch.argmax(probs).item() + + elif sampling_technique == 'greedy': + sampled_index = torch.argmax(mask_logits).item() + + else: + raise ValueError(f"Unknown sampling technique: {sampling_technique}") + + # Use the sampled index to get the corresponding token + sampled_token = candidate_tokens[sampled_index] + # Remove ## if it's a subword token + sampled_token = sampled_token.replace('##', '') + words[mask_pos] = sampled_token + + except Exception as e: + print(f"Error sampling for position {mask_pos}: {str(e)}") + continue + + return " ".join(words) + + def process_masked_sentences(self, results_dict, sampling_technique="temperature", temperature=1.0): + """ + Process all masked sentences in the results dictionary. + + Args: + results_dict (dict): Dictionary containing masked sentences and their logits + sampling_technique (str): Sampling method to use + temperature (float): Temperature parameter for sampling + + Returns: + dict: Dictionary containing original, masked, and sampled sentences + """ + processed_results = {} + + for original_sentence, data in results_dict.items(): + masked_sentence = data["masked_sentence"] + mask_logits = data["mask_logits"] + + sampled_sentence = self.sample_tokens( + mask_logits, + masked_sentence, + sampling_technique, + temperature + ) + + processed_results[original_sentence] = { + "masked_sentence": masked_sentence, + "sampled_sentence": sampled_sentence + } + + return processed_results + + +if __name__ == "__main__": + sentences = [ + "The quick brown fox jumps over the lazy dog everyday.", + "A speedy brown fox jumps over a lazy dog.", + "A swift brown fox leaps over the lethargic dog." 
+ + ] + result_dict ={ + 'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}, + 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}, + 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]} + } + + # First, mask the sentences + masking_processor = MaskingProcessor() + masking_results = masking_processor.process_sentences(sentences, result_dict) + + # Then, sample replacements for the masks + sampling_processor = SamplingProcessor(masking_processor.tokenizer) + + # Try different sampling techniques + sampling_techniques = ["temperature", "greedy", "inverse_transform", "exponential_minimum"] + + for technique in sampling_techniques: + print(f"\nSampling using {technique}:") + sampled_results = sampling_processor.process_masked_sentences( + masking_results, + sampling_technique=technique, + temperature=1.0 + ) + + ''' + { + "original_sentence_1": + { + "masked_sentence": "sentence with [MASK] tokens", + "sampling_method1": "sentence with sampled tokens", + }, + "original_sentence_2": + { + "masked_sentence": "sentence with [MASK] tokens", + "sampling_method": "sentence with sampled tokens" + }, + # ... and so on for each input sentence + }, + + ''' + + for original_sentence, result in sampled_results.items(): + print(f"Original: {original_sentence}") + print(f"Masked: {result['masked_sentence']}") + print(f"Sampled: {result['sampled_sentence']}") + print("---") + diff --git a/utils/paraphraser.py b/utils/paraphraser.py new file mode 100644 index 0000000000000000000000000000000000000000..daf1034b93d87c6e5bd8748719248efd9461f43c --- /dev/null +++ b/utils/paraphraser.py @@ -0,0 +1,75 @@ +""" +This file contains the code to generate paraphrases of sentences. +""" +import os +import sys +import logging +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +from tqdm import tqdm # for progress bars +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from utils.config import load_config +# config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.yaml') +# config = load_config(config_path)['PECCAVI_TEXT']['Paraphrase'] + +# Configure logging to show only warnings or above on the terminal. +logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +class Paraphraser: + """ + Paraphraser class to generate paraphrases of sentences. 
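Paraphrase generation in this module relies on diverse beam search (num_beam_groups plus diversity_penalty) so the returned candidates differ from one another rather than being near-duplicates of the best beam. A minimal sketch of that generate() call; the checkpoint id, prompt prefix, and penalty values are illustrative only, since the real values come from config.yaml.

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id = "humarin/chatgpt_paraphraser_on_T5_base"   # example checkpoint only
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

def paraphrase(sentence: str, n: int = 5) -> list:
    ids = tok.encode("paraphrase: " + sentence, return_tensors="pt",
                     max_length=64, truncation=True)
    out = model.generate(
        ids,
        max_length=64,
        num_beams=n * 2,              # must be divisible by num_beam_groups
        num_beam_groups=n,            # groups decoded semi-independently
        num_return_sequences=n,
        diversity_penalty=2.0,        # penalise tokens reused across groups
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
    )
    return [tok.decode(o, skip_special_tokens=True) for o in out]

# paraphrase("The quick brown fox jumps over the lazy dog.")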
+ """ + def __init__(self, config): + self.config = config + import torch + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + tqdm.write(f"[Paraphraser] Initializing on device: {self.device}") + self.tokenizer = AutoTokenizer.from_pretrained(config['tokenizer']) + self.model = AutoModelForSeq2SeqLM.from_pretrained(config['model']).to(self.device) + self.num_beams = config['num_beams'] + self.num_beam_groups = config['num_beam_groups'] + self.num_return_sequences = config['num_return_sequences'] + self.repetition_penalty = config['repetition_penalty'] + self.diversity_penalty = config['diversity_penalty'] + self.no_repeat_ngram_size = config['no_repeat_ngram_size'] + self.temperature = config['temperature'] + self.max_length = config['max_length'] + + def paraphrase(self, sentence: str, num_return_sequences: int=None, num_beams: int=None, num_beam_groups: int=None): + tqdm.write(f"[Paraphraser] Starting paraphrase for sentence: {sentence}") + if num_return_sequences is None: + num_return_sequences = self.num_return_sequences + if num_beams is None: + num_beams = self.num_beams + if num_beam_groups is None: + num_beam_groups = self.num_beam_groups + + inputs = self.tokenizer.encode("paraphrase: " + sentence, + return_tensors="pt", + max_length=self.max_length, + truncation=True).to(self.device) + outputs = self.model.generate( + inputs, + max_length=self.max_length, + num_beams=num_beams, + num_beam_groups=num_beam_groups, + num_return_sequences=num_return_sequences, + repetition_penalty=self.repetition_penalty, + diversity_penalty=self.diversity_penalty, + no_repeat_ngram_size=self.no_repeat_ngram_size, + temperature=self.temperature + ) + paraphrases = [self.tokenizer.decode(output, skip_special_tokens=True) + for output in tqdm(outputs, desc="Decoding Paraphrases")] + tqdm.write(f"[Paraphraser] Paraphrase completed. {len(paraphrases)} outputs generated.") + return paraphrases + +if __name__ == "__main__": + config_path = '/home/jigyasu/PECCAVI-Text/utils/config.yaml' + config = load_config(config_path) + paraphraser = Paraphraser(config['PECCAVI_TEXT']['Paraphrase']) + sentence = "The quick brown fox jumps over the lazy dog." + paraphrases = paraphraser.paraphrase(sentence) + for paraphrase in paraphrases: + print(paraphrase) \ No newline at end of file diff --git a/utils/sampling.py b/utils/sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..b341e8bec8864e801fd025e12a5ad574728e7de6 --- /dev/null +++ b/utils/sampling.py @@ -0,0 +1,181 @@ +import torch +import random +import logging +from utils.masking_methods import MaskingProcessor +from tqdm import tqdm + +# Configure logging to suppress INFO-level messages on the console. +logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +class SamplingProcessor: + def __init__(self, tokenizer): + """ + Initialize the SamplingProcessor. + + Args: + tokenizer: BERT tokenizer instance + """ + self.tokenizer = tokenizer + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + tqdm.write(f"[SamplingProcessor] Initialized on device: {self.device}") + + def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0): + """ + Sample tokens for each mask in the sentence using the specified sampling technique. 
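Every non-greedy branch of sample_tokens draws from random and torch, so outputs change from run to run. When comparing masking and sampling strategies it can help to pin the seeds once before processing; a small sketch, with an arbitrary seed value:

import random
import torch

def set_seed(seed: int = 42) -> None:
    """Make random.random() and torch.multinomial() draws repeatable."""
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)   # call before SamplingProcessor.process_masked_sentences(...)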
+ + Args: + mask_logits_dict (dict): Dictionary of mask positions and their logits/tokens + masked_sentence (str): Sentence with [MASK] tokens + sampling_technique (str): Sampling method to use + temperature (float): Temperature parameter for sampling + + Returns: + str: Sentence with sampled tokens replacing masks + """ + tqdm.write(f"[SamplingProcessor] Sampling tokens for: {masked_sentence}") + print(f"[SamplingProcessor] Sampling tokens for: {masked_sentence}") + words = masked_sentence.split() + print(f"words: {words}") + # Convert positions and logits to sorted list to process masks in order + mask_positions = sorted(mask_logits_dict.keys()) + print(f"mask_positions: {mask_positions}") + + for mask_pos in mask_positions: + mask_data = mask_logits_dict[mask_pos] + # Move logits tensor to GPU + mask_logits = torch.tensor(mask_data['logits']).to(self.device) + candidate_tokens = mask_data['tokens'] + + try: + if sampling_technique == "inverse_transform": + probs = torch.softmax(mask_logits / temperature, dim=-1) + cumulative_probs = torch.cumsum(probs, dim=-1) + random_prob = random.random() + sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item() + + elif sampling_technique == "exponential_minimum": + probs = torch.softmax(mask_logits / temperature, dim=-1) + exp_probs = torch.exp(-torch.log(probs)) + random_probs = torch.rand_like(exp_probs) + sampled_index = torch.argmax(random_probs * exp_probs).item() + + elif sampling_technique == "temperature": + mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8) + probs = torch.softmax(mask_logits / temperature, dim=-1) + if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)): + raise ValueError("The computed probabilities contain NaN or inf values.") + probs = torch.max(probs, torch.tensor(1e-8).to(self.device)) + probs = probs / torch.sum(probs) + probs = probs.flatten() + if probs.size(0) > 1: + sampled_index = torch.multinomial(probs, 1).item() + else: + sampled_index = torch.argmax(probs).item() + + elif sampling_technique == 'greedy': + sampled_index = torch.argmax(mask_logits).item() + + else: + raise ValueError(f"Unknown sampling technique: {sampling_technique}") + + # Use the sampled index to get the corresponding token + sampled_token = candidate_tokens[sampled_index] + # Remove ## if it's a subword token + sampled_token = sampled_token.replace('##', '') + words[mask_pos] = sampled_token + logger.info(f"Sampled token '{sampled_token}' for mask position {mask_pos}.") + + except Exception as e: + logger.error(f"Error sampling for position {mask_pos}: {str(e)}") + continue + + sampled_sentence = " ".join(words) + tqdm.write(f"[SamplingProcessor] Sampled sentence: {sampled_sentence}") + return sampled_sentence + + def process_masked_sentences(self, results_dict, sampling_technique="temperature", temperature=1.0): + """ + Process all masked sentences in the results dictionary. 
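masking_methods.MaskingProcessor itself is not part of this diff; the sampler only assumes that process_sentences() returns, per original sentence, a masked string plus per-position candidate tokens and scores. A hypothetical stand-in with that shape, useful for exercising the sampler in isolation (all names and values below are made up):

def fake_masking_results():
    """Illustrative stand-in for MaskingProcessor.process_sentences() output."""
    return {
        "The quick brown fox jumps over the lazy dog everyday.": {
            "masked_sentence": "The [MASK] brown fox jumps over the [MASK] dog everyday.",
            "mask_logits": {
                1: {"tokens": ["quick", "swift", "speedy"], "logits": [3.2, 2.7, 2.1]},
                7: {"tokens": ["lazy", "sleepy", "tired"], "logits": [3.5, 2.2, 1.9]},
            },
        }
    }

# SamplingProcessor(tokenizer).process_masked_sentences(
#     fake_masking_results(), sampling_technique="greedy")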
+ + Args: + results_dict (dict): Dictionary containing masked sentences and their logits + sampling_technique (str): Sampling method to use + temperature (float): Temperature parameter for sampling + + Returns: + dict: Dictionary containing original, masked, and sampled sentences + """ + tqdm.write("[SamplingProcessor] Starting sampling for masked sentences.") + processed_results = {} + # Wrap the iteration over each original sentence with tqdm + for original_sentence, data in tqdm(results_dict.items(), desc="Sampling Masked Sentences"): + masked_sentence = data["masked_sentence"] + mask_logits = data["mask_logits"] + + sampled_sentence = self.sample_tokens(mask_logits, + masked_sentence, + sampling_technique, + temperature) + processed_results[original_sentence] = { + "masked_sentence": masked_sentence, + "sampled_sentence": sampled_sentence + } + logger.info(f"Processed sampling for sentence: {original_sentence}") + tqdm.write("[SamplingProcessor] Completed sampling for all sentences.") + return processed_results + + +if __name__ == "__main__": + sentences = [ + "The quick brown fox jumps over the lazy dog everyday.", + "A speedy brown fox jumps over a lazy dog.", + "A swift brown fox leaps over the lethargic dog." + ] + result_dict = { + 'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}, + 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}, + 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]} + } + + # First, mask the sentences + masking_processor = MaskingProcessor() + masking_results = masking_processor.process_sentences(sentences, result_dict) + + # Then, sample replacements for the masks + sampling_processor = SamplingProcessor(masking_processor.tokenizer) + + # Try different sampling techniques + sampling_techniques = ["temperature", "greedy", "inverse_transform", "exponential_minimum"] + + for technique in sampling_techniques: + logger.info(f"Sampling using technique: {technique}") + sampled_results = sampling_processor.process_masked_sentences( + masking_results, + sampling_technique=technique, + temperature=1.0 + ) + + ''' + { + "original_sentence_1": + { + "masked_sentence": "sentence with [MASK] tokens", + "sampling_method1": "sentence with sampled tokens", + }, + "original_sentence_2": + { + "masked_sentence": "sentence with [MASK] tokens", + "sampling_method": "sentence with sampled tokens" + }, + # ... 
and so on for each input sentence + }, + + ''' + + for original_sentence, result in sampled_results.items(): + logger.info(f"Original: {original_sentence}") + logger.info(f"Masked: {result['masked_sentence']}") + logger.info(f"Sampled: {result['sampled_sentence']}") + logger.info("---") + diff --git a/utils/watermark.py b/utils/watermark.py new file mode 100644 index 0000000000000000000000000000000000000000..c5ff6d5cac0ea84fe073125943d02193c8c66ae9 --- /dev/null +++ b/utils/watermark.py @@ -0,0 +1,352 @@ +""" +This file contains the code to watermark given sentences using PECCAVI +""" +import os +import sys +import time +import random +import torch +from utils.paraphraser import Paraphraser +from utils.entailment import EntailmentAnalyzer +from utils.sampling import SamplingProcessor +# from tokenizer import tokenize_sentence, tokenize_sentences +from utils.non_melting_point import NgramProcessor +from utils.masking_methods import MaskingProcessor +from tqdm import tqdm # add this import at the top if not already present + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from renderers.highlighter import highlight_common_words,reparaphrased_sentences_html +from renderers.tree import generate_subplot1, generate_subplot2 +from renderers.plot_3d import gen_three_D_plot +# from metrics.detectability import SentenceDetectabilityCalculator +# from metrics.distortion import SentenceDistortionCalculator +# from metrics.euclidean_distance import SentenceEuclideanDistanceCalculator +from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM +from transformers import BertTokenizer, BertForMaskedLM +from pathlib import Path + + +from utils.config import load_config +import logging + +project_root = Path(__file__).parent.parent +config_path = project_root / "utils" / "config.yaml" + +# Update logging configuration to reduce clutter +logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + +class Watermarker: + def __init__(self, config): + self.config = config + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + tqdm.write(f"[Watermarker] Initializing on device: {self.device}") + self.user_prompt = None + self.paraphrased_sentences = None + self.analyzed_paraphrased_sentences = None + self.selected_sentences = None + self.discarded_sentences = None + self.common_grams = None + # self.subsequences = None + self.common_grams_position = None + self.masked_sentences = None + self.masked_words = None + self.masked_logits = None + self.sampled_sentences = None + self.reparaphrased_sentences = None + self.distortion_list = None + self.detectability_list = None + self.euclidean_dist_list = None + + self.masking_strategies = ['random', 'pseudorandom','entropy'] + self.sampling_strategies = ['inverse_transform', 'exponential_minimum', 'temperature', 'greedy'] + self.masking_results = dict() + self.sampling_results = dict() + + # Move the model to GPU if available. 
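The EntailmentAnalyzer imported above is likewise outside this diff; conceptually it keeps a paraphrase only when an NLI model scores entailment with the original prompt above the threshold passed to Paraphrase(). A hedged sketch of such a filter using a generic NLI checkpoint; the model id, label handling, and premise/hypothesis ordering are assumptions for illustration, not the project's actual implementation.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

nli_id = "roberta-large-mnli"   # example NLI checkpoint
nli_tok = AutoTokenizer.from_pretrained(nli_id)
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_id).eval()

def entailment_score(premise: str, hypothesis: str) -> float:
    """Probability that `premise` entails `hypothesis` under the NLI model."""
    enc = nli_tok(premise, hypothesis, return_tensors="pt", truncation=True)
    with torch.no_grad():
        probs = torch.softmax(nli_model(**enc).logits, dim=-1)[0]
    entail_idx = nli_model.config.label2id.get("ENTAILMENT", 2)  # MNLI convention
    return float(probs[entail_idx])

def split_by_entailment(prompt, paraphrases, threshold=0.7):
    scores = {p: entailment_score(p, prompt) for p in paraphrases}
    kept = {p: s for p, s in scores.items() if s >= threshold}
    dropped = {p: s for p, s in scores.items() if s < threshold}
    return scores, kept, dropped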
+        self.tokenizer = BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
+        self.model = BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking").to(self.device)
+
+        self.paraphraser = Paraphraser(self.config['Paraphrase'])
+        self.entailment_analyzer = EntailmentAnalyzer(self.config['Entailment'])
+        self.ngram_processor = NgramProcessor()
+        self.masker = MaskingProcessor(self.tokenizer, self.model)
+        self.sampler = SamplingProcessor(self.tokenizer)
+
+        # self.detectability_calculator = SentenceDetectabilityCalculator(self.config['Metrics'])
+        # self.distortion_calculator = SentenceDistortionCalculator(self.config['Metrics'])
+        # self.euclidean_distance_calculator = SentenceEuclideanDistanceCalculator(self.config['Metrics'])
+
+
+    def Paraphrase(self, prompt: str, threshold: float = 0.7):
+        """
+        This function paraphrases the given prompt using PECCAVI.
+        Args:
+            prompt: str: The prompt to be paraphrased
+            threshold: float: The entailment-score threshold used to accept a paraphrase
+        Returns:
+            None: the selected/discarded sentences and common n-grams are stored on the instance
+        """
+        start_time = time.time()
+        self.user_prompt = prompt
+        self.paraphrased_sentences = self.paraphraser.paraphrase(self.user_prompt)
+        if self.paraphrased_sentences is None:
+            print("Error in generating paraphrases", "Error: Could not complete step")
+            return None
+
+        self.analyzed_paraphrased_sentences, self.selected_sentences, self.discarded_sentences = self.entailment_analyzer.analyze_entailment(self.user_prompt, self.paraphrased_sentences, threshold)
+
+        self.selected_sentences_list = [key for key in self.selected_sentences.keys()]
+        self.discarded_sentences_list = [key for key in self.discarded_sentences.keys()]
+        self.full_list = self.selected_sentences_list.copy()
+        self.full_list.extend(self.discarded_sentences_list)
+        self.full_list.append(self.user_prompt)
+
+
+        # self.user_prompt_tokenized = tokenize_sentence(self.user_prompt)
+        # self.selected_sentences_tokenized = tokenize_sentences(self.selected_sentences)
+        # self.discarded_sentences_tokenized = tokenize_sentences(self.discarded_sentences)
+
+        # all_tokenized_sentences = []
+        # all_tokenized_sentences.append(self.user_prompt_tokenized)
+        # all_tokenized_sentences.extend(self.selected_sentences_tokenized)
+        # all_tokenized_sentences.extend(self.discarded_sentences_tokenized)
+
+        self.common_grams = self.ngram_processor.find_filtered_ngrams(self.full_list)
+        print(f"Common grams: {self.common_grams}")
+
+        if self.user_prompt in self.full_list:
+            self.full_list.remove(self.user_prompt)
+
+        # highlighted_user_prompt = highlight_common_words(self.common_grams, [self.user_prompt], "Highlighted LCS in the User Prompt")
+        # highlighted_accepted_sentences = highlight_common_words(self.common_grams, self.selected_sentences, "Highlighted LCS in the Accepted Sentences")
+        # highlighted_discarded_sentences = highlight_common_words(self.common_grams, self.discarded_sentences, "Highlighted LCS in the Discarded Sentences")
+
+        execution_time = time.time() - start_time
+        time_info = f"Step 1 completed in {execution_time:.2f} seconds"
+
+        # return [
+        #     highlighted_user_prompt,
+        #     highlighted_accepted_sentences,
+        #     highlighted_discarded_sentences,
+        #     time_info
+        # ]
+
+    def Masking(self):
+        """
+        For each masking strategy in self.masking_strategies, mask the sentences in self.full_list.
+        Return structure:
+        {
+            "<masking_strategy1>":
+            {
+                "Original sentence 1":
+                {
+                    "masked_sentence": "The sentence with appropriate [MASK] tokens",
+                    "mask_logits":
+                    {
+                        3:
+                        { # Example: mask index 3
+                            "tokens": ["word1", "word2",
...], # Top predicted tokens
+                            "logits": [score1, score2, ...] # Corresponding predicted scores
+                        },
+                        7:
+                        {
+                            "tokens": ["wordA", "wordB", ...],
+                            "logits": [scoreA, scoreB, ...]
+                        },
+                        # ... possibly additional mask positions
+                    }
+                },
+                "Original sentence 2":
+                {
+                    "masked_sentence": "Another masked sentence",
+                    "mask_logits": { ... }
+                },
+                # ... more sentences processed for this strategy
+            },
+            "<masking_strategy2>":
+            {
+                # Similar structure for each original sentence processed with masking_strategy2
+            },
+            # ... additional masking strategies if defined in self.masking_strategies
+        }
+        """
+        tqdm.write("[Watermarker] Starting Masking process.")
+        for strategy in self.masking_strategies:
+            tqdm.write(f"[Watermarker] Processing masking strategy: {strategy}")
+            results = self.masker.process_sentences(self.full_list, self.common_grams, strategy)
+            self.masking_results[strategy] = results
+        tqdm.write("[Watermarker] Masking process completed.")
+        return self.masking_results
+
+
+    def Sampling(self):
+        """
+        For each sampling strategy in self.sampling_strategies and each masking strategy in
+        self.masking_results, sample replacement tokens for the masked sentences.
+        Return structure:
+        {
+            "inverse_transform (SAMPLING STRATEGY)":
+            {
+                "random (MASKING STRATEGY)":
+                {
+                    "Original sentence 1":
+                    {
+                        "masked_sentence": "Masked version of sentence 1",
+                        "sampled_sentence": "Sampled version of sentence 1"
+                    },
+                    "Original sentence 2":
+                    {
+                        "masked_sentence": "Masked version of sentence 2",
+                        "sampled_sentence": "Sampled version of sentence 2"
+                    },
+                    # ... additional original sentences
+                },
+                "pseudorandom":
+                {
+                    # Similar structure for each original sentence
+                },
+                "entropy":
+                {
+                    # Similar structure for each original sentence
+                },
+            },
+            "exponential_minimum":
+            {
+                # Similar nested dictionaries for each masking strategy and original sentence
+            },
+            "greedy":
+            {
+                # Similar nested dictionaries for each masking strategy and original sentence
+            }
+        }
+        """
+        tqdm.write("[Watermarker] Starting Sampling process.")
+        for strategy in self.sampling_strategies:
+            tqdm.write(f"[Watermarker] Processing sampling strategy: {strategy}")
+            self.sampling_results[strategy] = {}
+            for mask_strategy in self.masking_strategies:
+                results = self.sampler.process_masked_sentences(
+                    self.masking_results[mask_strategy],
+                    sampling_technique=strategy,
+                    temperature=1.0
+                )
+                self.sampling_results[strategy][mask_strategy] = results
+        tqdm.write("[Watermarker] Sampling process completed.")
+        return self.sampling_results
+
+    def re_paraphrasing(self):
+        tqdm.write("[Watermarker] Starting re-paraphrasing process.")
+        self.reparaphrasing_results = {}
+        for sampling_strategy, mask_dict in tqdm(self.sampling_results.items(), desc="Sampling Strategies", leave=True):
+            self.reparaphrasing_results[sampling_strategy] = {}
+            for mask_strategy, sentences_data in tqdm(mask_dict.items(), desc="Masking Strategies", leave=False):
+                self.reparaphrasing_results[sampling_strategy][mask_strategy] = {}
+                for original_sentence, result in tqdm(sentences_data.items(), desc="Sentences", leave=False):
+                    sampled_sentence = result.get("sampled_sentence", None)
+                    if sampled_sentence:
+                        new_paraphrases = self.paraphraser.paraphrase(sampled_sentence,
+                                                                      num_return_sequences=10,
+                                                                      num_beams=10)
+                    else:
+                        new_paraphrases = []
+                    self.reparaphrasing_results[sampling_strategy][mask_strategy][original_sentence] = {
+                        "masking_strategy": mask_strategy,
+                        "sampling_strategy": sampling_strategy,
+                        "sampled_sentence": sampled_sentence,
+                        "re_paraphrased_sentences": new_paraphrases
+                    }
+        
tqdm.write("[Watermarker] Re-paraphrasing process completed.") + return self.reparaphrasing_results + + def calculate_distortion(self): + return None + +if __name__ == "__main__": + # config_path = '/home/jigyasu/PECCAVI-Text/utils/config.yaml' + config = load_config(config_path)['PECCAVI_TEXT'] + watermarker = Watermarker(config) + + logger.info("Starting main Watermarker process.") + print("==> Paraphrasing:") + watermarker.Paraphrase("The quick brown fox jumps over small cat the lazy dog everyday again and again.") + logger.info("Paraphrasing completed.") + + # Prepare a list to accumulate result strings + results_str = [] + results_str.append("========== WATERMARKING RESULTS ==========\n\n") + + # --- Step 2: Common N-grams --- + results_str.append("==> Common N-grams:\n") + if watermarker.common_grams: + for ngram, positions in watermarker.common_grams.items(): + results_str.append(f" {ngram}: {positions}\n") + else: + results_str.append(" No common n-grams found.\n") + + # --- Step 3: Selected Sentences --- + results_str.append("\n==> Selected Sentences:\n") + if watermarker.selected_sentences: + for sentence in watermarker.selected_sentences: + results_str.append(f" {sentence}\n") + else: + results_str.append(" No selected sentences available.\n") + + # --- Step 4: Masking Results (without logits) --- + results_str.append("\n==> Masking Results:\n") + masking_results = watermarker.Masking() + for masking_strategy, results_dict in masking_results.items(): + results_str.append(f"\n-- Masking Strategy: {masking_strategy} --\n") + for original_sentence, data in results_dict.items(): + masked_sentence = data.get("masked_sentence", "") + results_str.append("Original:\n") + results_str.append(f" {original_sentence}\n") + results_str.append("Masked:\n") + results_str.append(f" {masked_sentence}\n") + results_str.append("-----\n") + + # --- Step 5: Sampling Results --- + results_str.append("\n==> Sampling Results:\n") + sampling_results = watermarker.Sampling() + for sampling_strategy, mask_strategy_dict in sampling_results.items(): + results_str.append(f"\n-- Sampling Strategy: {sampling_strategy} --\n") + for mask_strategy, sentences in mask_strategy_dict.items(): + results_str.append(f"\n Masking Strategy: {mask_strategy}\n") + for original_sentence, res in sentences.items(): + masked_sentence = res.get("masked_sentence", "") + sampled_sentence = res.get("sampled_sentence", "") + results_str.append(" Original:\n") + results_str.append(f" {original_sentence}\n") + results_str.append(" Masked:\n") + results_str.append(f" {masked_sentence}\n") + results_str.append(" Sampled:\n") + results_str.append(f" {sampled_sentence}\n") + results_str.append(" -----\n") + + # --- Step 6: Re-paraphrasing Results --- + results_str.append("\n==> Re-paraphrasing Results:\n") + reparaphrasing_results = watermarker.re_paraphrasing() + for sampling_strategy, mask_dict in reparaphrasing_results.items(): + results_str.append(f"\n-- Sampling Strategy: {sampling_strategy} --\n") + for mask_strategy, orig_sentence_dict in mask_dict.items(): + results_str.append(f"\n Masking Strategy: {mask_strategy}\n") + for original_sentence, data in orig_sentence_dict.items(): + sampled_sentence = data.get("sampled_sentence", "") + re_paraphrases = data.get("re_paraphrased_sentences", []) + results_str.append(" Original:\n") + results_str.append(f" {original_sentence}\n") + results_str.append(" Sampled:\n") + results_str.append(f" {sampled_sentence}\n") + results_str.append(" Re-paraphrased (first 3 examples):\n") + # Display 
only the first 3 re-paraphrases for brevity + for idx, rp in enumerate(re_paraphrases[:3]): + results_str.append(f" {idx+1}. {rp}\n") + results_str.append(" -----\n") + + # Write all results to the output file + output_file = "watermarking_results.txt" + with open(output_file, "w", encoding="utf-8") as f: + f.writelines(results_str) + + logger.info("Writing results to output file.") + print("\nResults have been written to", output_file) \ No newline at end of file diff --git a/utils/watermarking_results.txt b/utils/watermarking_results.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c6db638350a643fb4b6aa28b785fe36d8295bf3 --- /dev/null +++ b/utils/watermarking_results.txt @@ -0,0 +1,547 @@ +========== WATERMARKING RESULTS ========== + +==> Common N-grams: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis.: {'brown fox': [(2, 3)], 'dog': [(9, 9)], 'small': [(12, 12)]} + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend.: {'brown fox': [(3, 4)], 'dog': [(9, 9)], 'small': [(12, 12)]} + The quick brown fox jumps over small cat the lazy dog everyday again and again.: {'brown fox': [(2, 3)], 'dog': [(10, 10)], 'small': [(6, 6)]} + +==> Selected Sentences: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + +==> Masking Results: + +-- Masking Strategy: random -- +Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. +Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . +----- +Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. +Masked: + Repeatedly, the [MASK] brown fox leaps over an [MASK] dog and its small [MASK] . +----- +Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. +Masked: + The [MASK] brown fox jumps over [MASK] [MASK] the lazy dog everyday again and again . +----- + +-- Masking Strategy: pseudorandom -- +Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. +Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . +----- +Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. +Masked: + [MASK] the rapid brown fox [MASK] over an inactive dog and its small [MASK] . +----- +Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. +Masked: + The [MASK] brown fox jumps over [MASK] cat the lazy [MASK] everyday again and again . +----- + +-- Masking Strategy: entropy -- +Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. +Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . +----- +Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. +Masked: + [MASK] the rapid brown fox leaps over an [MASK] dog and its small [MASK] . +----- +Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. +Masked: + The [MASK] brown fox jumps over small cat the [MASK] dog everyday again and again . +----- + +==> Sampling Results: + +-- Sampling Strategy: inverse_transform -- + + Masking Strategy: random + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. 
+ Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . + Sampled: + A massive brown fox frequently leaps over the neighborhood dog, who is small, on a regular basis . + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Masked: + Repeatedly, the [MASK] brown fox leaps over an [MASK] dog and its small [MASK] . + Sampled: + Repeatedly, the little brown fox leaps over an adult dog and its small body . + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Masked: + The [MASK] brown fox jumps over [MASK] [MASK] the lazy dog everyday again and again . + Sampled: + The large brown fox jumps over the bucks the lazy dog everyday again and again . + ----- + + Masking Strategy: pseudorandom + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . + Sampled: + A big brown fox frequently leaps over the white dog, who is small, on a regular basis . + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Masked: + [MASK] the rapid brown fox [MASK] over an inactive dog and its small [MASK] . + Sampled: + There the rapid brown fox jumps over an inactive dog and its small owner . + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Masked: + The [MASK] brown fox jumps over [MASK] cat the lazy [MASK] everyday again and again . + Sampled: + The gray brown fox jumps over the cat the lazy brown everyday again and again . + ----- + + Masking Strategy: entropy + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . + Sampled: + A huge brown fox frequently leaps over the guard dog, who is small, on a consistent basis . + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Masked: + [MASK] the rapid brown fox leaps over an [MASK] dog and its small [MASK] . + Sampled: + Suddenly the rapid brown fox leaps over an imaginary dog and its small tail . + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Masked: + The [MASK] brown fox jumps over small cat the [MASK] dog everyday again and again . + Sampled: + The big brown fox jumps over small cat the black dog everyday again and again . + ----- + +-- Sampling Strategy: exponential_minimum -- + + Masking Strategy: random + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . + Sampled: + A hunting brown fox frequently leaps over the domestic dog, who is small, on a large basis . + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Masked: + Repeatedly, the [MASK] brown fox leaps over an [MASK] dog and its small [MASK] . + Sampled: + Repeatedly, the smaller brown fox leaps over an unfortunate dog and its small collar . + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Masked: + The [MASK] brown fox jumps over [MASK] [MASK] the lazy dog everyday again and again . 
+ Sampled: + The night brown fox jumps over he bird the lazy dog everyday again and again . + ----- + + Masking Strategy: pseudorandom + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . + Sampled: + A male brown fox frequently leaps over the opposing dog, who is small, on a game basis . + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Masked: + [MASK] the rapid brown fox [MASK] over an inactive dog and its small [MASK] . + Sampled: + Only the rapid brown fox sits over an inactive dog and its small slave . + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Masked: + The [MASK] brown fox jumps over [MASK] cat the lazy [MASK] everyday again and again . + Sampled: + The noisy brown fox jumps over he cat the lazy to everyday again and again . + ----- + + Masking Strategy: entropy + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . + Sampled: + A hunting brown fox frequently leaps over the domestic dog, who is small, on a game basis . + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Masked: + [MASK] the rapid brown fox leaps over an [MASK] dog and its small [MASK] . + Sampled: + With the rapid brown fox leaps over an enthusiastic dog and its small target . + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Masked: + The [MASK] brown fox jumps over small cat the [MASK] dog everyday again and again . + Sampled: + The brave brown fox jumps over small cat the grey dog everyday again and again . + ----- + +-- Sampling Strategy: greedy -- + + Masking Strategy: random + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . + Sampled: + A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis . + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Masked: + Repeatedly, the [MASK] brown fox leaps over an [MASK] dog and its small [MASK] . + Sampled: + Repeatedly, the great brown fox leaps over an old dog and its small owner . + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Masked: + The [MASK] brown fox jumps over [MASK] [MASK] the lazy dog everyday again and again . + Sampled: + The big brown fox jumps over the rabbit the lazy dog everyday again and again . + ----- + + Masking Strategy: pseudorandom + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . + Sampled: + A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis . + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Masked: + [MASK] the rapid brown fox [MASK] over an inactive dog and its small [MASK] . + Sampled: + Suddenly the rapid brown fox jumps over an inactive dog and its small owner . 
+ ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Masked: + The [MASK] brown fox jumps over [MASK] cat the lazy [MASK] everyday again and again . + Sampled: + The big brown fox jumps over the cat the lazy brown everyday again and again . + ----- + + Masking Strategy: entropy + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Masked: + A [MASK] brown fox frequently leaps over the [MASK] dog, who is small, on a [MASK] basis . + Sampled: + A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis . + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Masked: + [MASK] the rapid brown fox leaps over an [MASK] dog and its small [MASK] . + Sampled: + Suddenly the rapid brown fox leaps over an old dog and its small owner . + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Masked: + The [MASK] brown fox jumps over small cat the [MASK] dog everyday again and again . + Sampled: + The big brown fox jumps over small cat the small dog everyday again and again . + ----- + +==> Re-paraphrasing Results: + +-- Sampling Strategy: inverse_transform -- + + Masking Strategy: random + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Sampled: + A massive brown fox frequently leaps over the neighborhood dog, who is small, on a regular basis . + Re-paraphrased (first 3 examples): + 1. The small dog in the neighborhood is frequently jumped by a massive brown fox. + 2. A large brown fox frequently jumps over the small dog that lives nearby. + 3. The neighborhood dog, which is small, is frequently jumped by a massive brown fox. + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Sampled: + Repeatedly, the little brown fox leaps over an adult dog and its small body . + Re-paraphrased (first 3 examples): + 1. The small brown fox repeatedly jumps over an adult dog and its tiny body. + 2. Repeatedly, the small brown fox jumps over an adult dog and its tiny body. + 3. On numerous occasions, the small brown fox leaps over an adult dog and its tiny body. + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Sampled: + The large brown fox jumps over the bucks the lazy dog everyday again and again . + Re-paraphrased (first 3 examples): + 1. Every day, the big brown fox jumps over the lazy dog's body. + 2. The oversized brown fox jumps over the lazy dog's body on a regular basis. + 3. A persistent action is the jumping of the large brown fox over the lazy dog's body. + ----- + + Masking Strategy: pseudorandom + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Sampled: + A big brown fox frequently leaps over the white dog, who is small, on a regular basis . + Re-paraphrased (first 3 examples): + 1. The small white dog is frequently jumped by a large brown fox. + 2. It is common for a large brown fox to jump over the small white dog. + 3. A large brown fox often jumps over the small white dog. + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Sampled: + There the rapid brown fox jumps over an inactive dog and its small owner . + Re-paraphrased (first 3 examples): + 1. A speedy brown fox leaps over a doggy and its small master. + 2. 
The swift brown fox leaps over an idle dog and its small owner. + 3. An agile brown fox swiftly leaps over a dour dog and its small master. + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Sampled: + The gray brown fox jumps over the cat the lazy brown everyday again and again . + Re-paraphrased (first 3 examples): + 1. Every day, the gray brown fox jumps over the lazy brown cat. + 2. On a regular basis, the gray brown fox jumps over the lazy brown cat. + 3. The gray brown fox repeatedly jumps over the lazy brown cat. + ----- + + Masking Strategy: entropy + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Sampled: + A huge brown fox frequently leaps over the guard dog, who is small, on a consistent basis . + Re-paraphrased (first 3 examples): + 1. The guard dog, which is small, is frequently jumped over by a massive brown fox. + 2. A small guard dog is regularly jumped by a massive brown fox. + 3. Every so often a large brown fox jumps over the small guard dog. + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Sampled: + Suddenly the rapid brown fox leaps over an imaginary dog and its small tail . + Re-paraphrased (first 3 examples): + 1. The swift brown fox instinctively jumps over an imaginary dog and its tiny tail. + 2. Suddenly, the swift brown fox jumps over an imaginary dog and its tiny tail. + 3. In an instant, the swift brown fox jumps over a make-believe dog and its tiny tail. + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Sampled: + The big brown fox jumps over small cat the black dog everyday again and again . + Re-paraphrased (first 3 examples): + 1. The big brown fox repeatedly jumps over the small cat and black dog. + 2. Every day, the large brown fox jumps over a small cat and black dog. + 3. On a regular basis, the large brown fox jumps over the little cat and black dog. + ----- + +-- Sampling Strategy: exponential_minimum -- + + Masking Strategy: random + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Sampled: + A hunting brown fox frequently leaps over the domestic dog, who is small, on a large basis . + Re-paraphrased (first 3 examples): + 1. The domestic dog, despite its size, is frequently leapt over by a brown fox engaged in hunting. + 2. A brown fox used for hunting is known to jump over the small domestic dog on large ground. + 3. The small domestic dog is frequently jumped over by a hunting brown fox on 'large grounds'. + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Sampled: + Repeatedly, the smaller brown fox leaps over an unfortunate dog and its small collar . + Re-paraphrased (first 3 examples): + 1. The small brown fox repeatedly jumps over an unfortunate dog and its tiny collar. + 2. Repeatedly, the smaller brown fox jumps over an unfortunate dog and its small collar. + 3. On a regular basis, the smaller brown fox jumps over an unfortunate dog and its small collar. + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Sampled: + The night brown fox jumps over he bird the lazy dog everyday again and again . + Re-paraphrased (first 3 examples): + 1. The bird the lazy dog is frequently jumped over by the night brown fox. + 2. 
Every so often, the brown fox of the night jumps over the bird's lazy dog. + 3. On a regular basis, the brown fox of night jumps over the lazy dog bird. + ----- + + Masking Strategy: pseudorandom + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Sampled: + A male brown fox frequently leaps over the opposing dog, who is small, on a game basis . + Re-paraphrased (first 3 examples): + 1. On occasion, a male brown fox jumps over the small dog that is on the other side as part of its game. + 2. The male brown fox is known to jump over the small dog in game on a regular basis. + 3. During game, a male brown fox frequently jumps over the small dog that is on the same side. + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Sampled: + Only the rapid brown fox sits over an inactive dog and its small slave . + Re-paraphrased (first 3 examples): + 1. An inactive dog and its small slave are accompanied by only one swift brown fox. + 2. The only creature that sits on top of an inactive dog and its small slave is a swift brown fox. + 3. A slow brown fox is the sole entity leaning against an inactive dog and its small slave. + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Sampled: + The noisy brown fox jumps over he cat the lazy to everyday again and again . + Re-paraphrased (first 3 examples): + 1. Every once in a while, the noisy brown fox jumps over his lazy lazy cat. + 2. Each time he touches the lazy cat, the brown fox, who is loud, jumps over it. + 3. The shrieking brown fox jumps over his lazy lazy owner on a regular basis. + ----- + + Masking Strategy: entropy + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Sampled: + A hunting brown fox frequently leaps over the domestic dog, who is small, on a game basis . + Re-paraphrased (first 3 examples): + 1. The small domestic dog is frequently jumped over by a hunting brown fox when it's playing. + 2. A brown fox, which is often used for hunting purposes, jumps over a small domestic dog when it comes in contact with the game. + 3. For hunting purposes, a brown fox often jumps over the small domestic dog. + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Sampled: + With the rapid brown fox leaps over an enthusiastic dog and its small target . + Re-paraphrased (first 3 examples): + 1. The swift brown fox leaps over a lively dog and its tiny prey. + 2. A speedy brown fox leaps over a playful dog and its small prey. + 3. An eager dog and its small prey are swiftly spooked by the fast-moving brown fox. + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Sampled: + The brave brown fox jumps over small cat the grey dog everyday again and again . + Re-paraphrased (first 3 examples): + 1. Every day, the courageous brown fox jumps over a small cat and grey dog. + 2. The fearless brown fox repeatedly jumps over the small cat and grey dog. + 3. The courageous brown fox repeatedly jumps over the grey dog and small cat. + ----- + +-- Sampling Strategy: greedy -- + + Masking Strategy: random + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Sampled: + A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis . + Re-paraphrased (first 3 examples): + 1. 
The nearest dog is frequently swarmed by a big brown fox. + 2. It is common for a large brown fox to jump over the nearest dog, even though it is small. + 3. A large brown fox is known to jump over the nearest dog, which is a small canine. + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Sampled: + Repeatedly, the great brown fox leaps over an old dog and its small owner . + Re-paraphrased (first 3 examples): + 1. The great brown fox repeatedly jumps over an elderly dog and its small owner. + 2. On numerous occasions, the great brown fox jumps over an elderly dog and its small owner. + 3. An elderly dog and its small owner are repeatedly jumped by the great brown fox. + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Sampled: + The big brown fox jumps over the rabbit the lazy dog everyday again and again . + Re-paraphrased (first 3 examples): + 1. Every day, the large brown fox jumps over the lazy dog and falls back down. + 2. On a regular basis, the large brown fox jumps over the lazy dog and rabbit. + 3. Each time the unruly rabbit and the lazy dog are spotted, the big brown fox jumps over them. + ----- + + Masking Strategy: pseudorandom + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Sampled: + A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis . + Re-paraphrased (first 3 examples): + 1. The nearest dog is frequently swarmed by a big brown fox. + 2. It is common for a large brown fox to jump over the nearest dog, even though it is small. + 3. A large brown fox is known to jump over the nearest dog, which is a small canine. + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Sampled: + Suddenly the rapid brown fox jumps over an inactive dog and its small owner . + Re-paraphrased (first 3 examples): + 1. A speedy brown fox leaps over a doggy and its small owner without any prior thought. + 2. In an instant, the swift brown fox leaps over a doddery dog and its small owner. + 3. The swift brown fox leaps over an idle dog and its small owner without warning. + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Sampled: + The big brown fox jumps over the cat the lazy brown everyday again and again . + Re-paraphrased (first 3 examples): + 1. Every now and then, the large brown fox jumps over the lazy brown cat. + 2. On a regular basis, the large brown fox jumps over the lazy brown cat. + 3. The large brown fox repeatedly jumps over the lazy brown cat. + ----- + + Masking Strategy: entropy + Original: + A quick brown fox frequently leaps over the lazy dog, who is small, on a daily basis. + Sampled: + A large brown fox frequently leaps over the nearest dog, who is small, on a regular basis . + Re-paraphrased (first 3 examples): + 1. The nearest dog is frequently swarmed by a big brown fox. + 2. It is common for a large brown fox to jump over the nearest dog, even though it is small. + 3. A large brown fox is known to jump over the nearest dog, which is a small canine. + ----- + Original: + Repeatedly, the rapid brown fox leaps over an inactive dog and its small friend. + Sampled: + Suddenly the rapid brown fox leaps over an old dog and its small owner . + Re-paraphrased (first 3 examples): + 1. The brown fox, which is fast and agile, suddenly jumps over an elderly dog and its small owner. + 2. 
A speedy brown fox suddenly jumps over an old dog and its small owner. + 3. In an instant, the swift brown fox leaps over a small owner and an elderly dog. + ----- + Original: + The quick brown fox jumps over small cat the lazy dog everyday again and again. + Sampled: + The big brown fox jumps over small cat the small dog everyday again and again . + Re-paraphrased (first 3 examples): + 1. Every day, the big brown fox jumps over a small dog and cat. + 2. The big brown fox repeatedly jumps over the little cat and dog. + 3. Every now and then, the big brown fox jumps over a small dog or cat. + -----
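
For reference, the following is a minimal driver sketch showing how the nested results returned by Watermarker.Masking() and Watermarker.Sampling() can be traversed programmatically rather than dumped to watermarking_results.txt. It assumes the package layout added above (utils.watermark, utils.config) and a config.yaml providing the 'PECCAVI_TEXT' section used by the __main__ block of utils/watermark.py; the config path below is illustrative.

from pathlib import Path

from utils.config import load_config
from utils.watermark import Watermarker

# Illustrative path: assumes this script sits in the repository root, next to utils/.
config_path = Path(__file__).parent / "utils" / "config.yaml"
config = load_config(config_path)["PECCAVI_TEXT"]

watermarker = Watermarker(config)
watermarker.Paraphrase("The quick brown fox jumps over the lazy dog everyday.")

# Masking() must run before Sampling(), which reads self.masking_results.
masking_results = watermarker.Masking()    # {masking_strategy: {original_sentence: {...}}}
sampling_results = watermarker.Sampling()  # {sampling_strategy: {masking_strategy: {original_sentence: {...}}}}

# Walk the nested structure documented in Watermarker.Sampling().
for sampling_strategy, by_masking in sampling_results.items():
    for masking_strategy, by_sentence in by_masking.items():
        for original_sentence, entry in by_sentence.items():
            print(f"[{sampling_strategy} / {masking_strategy}] {original_sentence}")
            print("  masked :", entry["masked_sentence"])
            print("  sampled:", entry["sampled_sentence"])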