Add entire pipeline
- UI/__pycache__/gradio.cpython-310.pyc +0 -0
- UI/__pycache__/gradio.cpython-311.pyc +0 -0
- UI/gradio.py +516 -0
- __pycache__/app.cpython-310.pyc +0 -0
- app.py +21 -0
- environment.yml +245 -0
- metrics/distortion.py +370 -0
- renderers/__pycache__/highlighter.cpython-310.pyc +0 -0
- renderers/__pycache__/highlighter.cpython-311.pyc +0 -0
- renderers/__pycache__/plot_3d.cpython-310.pyc +0 -0
- renderers/__pycache__/plot_3d.cpython-311.pyc +0 -0
- renderers/__pycache__/tree.cpython-310.pyc +0 -0
- renderers/__pycache__/tree.cpython-311.pyc +0 -0
- renderers/highlighter.py +162 -0
- renderers/plot_3d.py +126 -0
- renderers/tree.py +490 -0
- utils/__init__.py +5 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/__init__.cpython-311.pyc +0 -0
- utils/__pycache__/config.cpython-310.pyc +0 -0
- utils/__pycache__/config.cpython-311.pyc +0 -0
- utils/__pycache__/entailment.cpython-310.pyc +0 -0
- utils/__pycache__/entailment.cpython-311.pyc +0 -0
- utils/__pycache__/masking_methods.cpython-310.pyc +0 -0
- utils/__pycache__/masking_methods.cpython-311.pyc +0 -0
- utils/__pycache__/non_melting_point.cpython-310.pyc +0 -0
- utils/__pycache__/non_melting_point.cpython-311.pyc +0 -0
- utils/__pycache__/paraphraser.cpython-310.pyc +0 -0
- utils/__pycache__/paraphraser.cpython-311.pyc +0 -0
- utils/__pycache__/sampling.cpython-310.pyc +0 -0
- utils/__pycache__/sampling.cpython-311.pyc +0 -0
- utils/__pycache__/watermark.cpython-310.pyc +0 -0
- utils/__pycache__/watermark.cpython-311.pyc +0 -0
- utils/config.py +18 -0
- utils/config.yaml +48 -0
- utils/entailment.py +107 -0
- utils/masking_methods.py +304 -0
- utils/non_melting_point.py +137 -0
- utils/old/masking/masking_methods.py +355 -0
- utils/old/masking/masking_methods_new_work.py +447 -0
- utils/old/masking/masking_methods_ok_working.py +257 -0
- utils/old/masking/masking_methods_v1_working.py +233 -0
- utils/old/masking_methods_final_copy.py +619 -0
- utils/old/non_melting_points_v1.py +244 -0
- utils/old/sampling/sampling.py +330 -0
- utils/old/sampling/sampling_methods.py +291 -0
- utils/old/sampling/sampling_methods_v1.py +146 -0
- utils/old/sampling/sampling_methods_v2.py +112 -0
- utils/old/sampling_final_copy.py +168 -0
- utils/paraphraser.py +75 -0
UI/__pycache__/gradio.cpython-310.pyc
ADDED
Binary file (6.61 kB)
UI/__pycache__/gradio.cpython-311.pyc
ADDED
Binary file (27.3 kB)
UI/gradio.py
ADDED
@@ -0,0 +1,516 @@
import gradio as gr
from utils.watermark import Watermarker
from utils.config import load_config
from renderers.highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html
from renderers.tree import generate_subplot1, generate_subplot2
from pathlib import Path
import time
from typing import Dict, List, Tuple, Any
import plotly.graph_objects as go


class WatermarkerInterface:
    def __init__(self, config):

        self.pipeline = Watermarker(config)
        self.common_grams = {}
        self.highlight_info = []
        self.masked_sentences = []

    def handle_paraphrase(self, prompt: str) -> Tuple[str, str, str, str]:
        """Wrapper for paraphrasing that includes highlighting"""
        start_time = time.time()

        # Run paraphrasing
        self.pipeline.Paraphrase(prompt)

        # Step 1: Process the original sentence first
        seen_ngrams = {}  # Stores first occurrence index of each n-gram
        original_indexed_ngrams = []  # Final indexed list for original

        original_sentence = self.pipeline.user_prompt
        original_ngrams = self.pipeline.common_grams.get(original_sentence, {})

        # Step 1.1: Extract n-grams and their first occurrence index
        ngram_occurrences = [
            (min(indices, key=lambda x: x[0])[0], gram)  # Get first index
            for gram, indices in original_ngrams.items()
        ]

        # Step 1.2: Sort n-grams based on their first occurrence
        ngram_occurrences.sort()

        # Step 1.3: Assign sequential indices
        for idx, (position, gram) in enumerate(ngram_occurrences, start=1):
            seen_ngrams[gram] = idx  # Assign sequential index
            original_indexed_ngrams.append((idx, gram))

        print("Original Indexed N-grams:", original_indexed_ngrams)

        # Generate highlight_info
        colors = ["red", "blue", "green", "purple", "orange"]
        highlight_info = [
            (ngram, colors[i % len(colors)])
            for i, (index, ngram) in enumerate(original_indexed_ngrams)
        ]
        common_grams = original_indexed_ngrams
        self.highlight_info = highlight_info
        self.common_grams = common_grams

        # Step 2: Process paraphrased sentences and match indices
        paraphrase_indexed_ngrams = {}

        for sentence in self.pipeline.paraphrased_sentences:
            sentence_ngrams = []  # Stores n-grams for this sentence
            sentence_ngrams_dict = self.pipeline.common_grams.get(sentence, {})

            for gram, indices in sentence_ngrams_dict.items():
                first_occurrence = min(indices, key=lambda x: x[0])[0]

                # Use the original's index if it exists, otherwise assign a new one
                if gram in seen_ngrams:
                    index = seen_ngrams[gram]  # Use the same index as original
                else:
                    index = len(seen_ngrams) + 1  # Assign new index
                    seen_ngrams[gram] = index  # Store it

                sentence_ngrams.append((index, gram))

            sentence_ngrams.sort()
            paraphrase_indexed_ngrams[sentence] = sentence_ngrams

        print("Paraphrase Indexed N-grams:", paraphrase_indexed_ngrams)

        # Step 3: Generate highlighted versions using the renderer
        highlighted_prompt = highlight_common_words(
            common_grams,
            [self.pipeline.user_prompt],
            "Original Prompt with Highlighted Common Sequences"
        )

        highlighted_accepted = highlight_common_words_dict(
            common_grams,
            self.pipeline.selected_sentences,
            "Accepted Paraphrased Sentences with Entailment Scores"
        )

        highlighted_discarded = highlight_common_words_dict(
            common_grams,
            self.pipeline.discarded_sentences,
            "Discarded Paraphrased Sentences with Entailment Scores"
        )

        execution_time = f"<div class='execution-time'>Step 1 completed in {time.time() - start_time:.2f} seconds</div>"
        self.highlight_info = highlight_info
        self.common_grams = common_grams

        return highlighted_prompt, highlighted_accepted, highlighted_discarded, execution_time

    def handle_masking(self) -> Tuple[List[go.Figure], str]:
        """Wrapper for masking that generates visualization trees"""
        start_time = time.time()

        masking_results = self.pipeline.Masking()
        trees = []
        highlight_info = self.highlight_info
        common_grams = self.common_grams
        sentence_to_masked = {}

        # Create a consolidated figure with all strategies
        original_sentence = None

        # First pass - gather all sentences and strategies
        for strategy, sentence_dict in masking_results.items():
            for sent, data in sentence_dict.items():
                if sent not in sentence_to_masked:
                    sentence_to_masked[sent] = []
                try:
                    if not isinstance(data, dict):
                        print(f"[ERROR] Data is not a dictionary for {sent} with strategy {strategy}")
                        continue

                    masked_sentence = data.get("masked_sentence", "")
                    if masked_sentence:
                        sentence_to_masked[sent].append((masked_sentence, strategy))
                except Exception as e:
                    print(f"Error processing {strategy} for sentence {sent}: {e}")

        for original_sentence, masked_sentences_data in sentence_to_masked.items():
            if not masked_sentences_data:
                continue
            masked_sentences = [ms[0] for ms in masked_sentences_data]
            strategies = [ms[1] for ms in masked_sentences_data]
            try:
                fig = generate_subplot1(
                    original_sentence,
                    masked_sentences,
                    strategies,
                    highlight_info,
                    common_grams
                )
                trees.append(fig)
            except Exception as e:
                print(f"Error generating multi-strategy tree: {e}")
                trees.append(go.Figure())

        # Pad with empty plots if needed
        while len(trees) < 10:
            trees.append(go.Figure())

        execution_time = f"<div class='execution-time'>Step 2 completed in {time.time() - start_time:.2f} seconds</div>"

        return trees[:10] + [execution_time]

    def handle_sampling(self) -> Tuple[List[go.Figure], str]:
        """Wrapper for sampling that generates visualization trees"""
        start_time = time.time()
        sampling_results = self.pipeline.Sampling()
        trees = []

        # Group sentences by original sentence
        organized_results = {}

        # Generate trees for each sampled sentence
        for sampling_strategy, masking_dict in sampling_results.items():
            for masking_strategy, sentences in masking_dict.items():
                for original_sentence, data in sentences.items():
                    if original_sentence not in organized_results:
                        organized_results[original_sentence] = {}

                    if masking_strategy not in organized_results[original_sentence]:
                        organized_results[original_sentence][masking_strategy] = {
                            "masked_sentence": data.get("masked_sentence", ""),  # Corrected reference
                            "sampled_sentences": {}
                        }

                    # Add this sampling result
                    organized_results[original_sentence][masking_strategy]["sampled_sentences"][sampling_strategy] = data.get("sampled_sentence", "")

        for original_sentence, data in organized_results.items():
            masked_sentences = []
            all_sampled_sentences = []

            for masking_strategy, masking_data in list(data.items())[:3]:  # Ensure this iteration is safe
                masked_sentence = masking_data.get("masked_sentence", "")
                if masked_sentence:
                    masked_sentences.append(masked_sentence)

                for sampling_strategy, sampled_sentence in masking_data.get("sampled_sentences", {}).items():
                    if sampled_sentence:
                        all_sampled_sentences.append(sampled_sentence)

            if masked_sentences:
                try:
                    fig = generate_subplot2(
                        masked_sentences,
                        all_sampled_sentences,
                        self.highlight_info,
                        self.common_grams
                    )
                    trees.append(fig)
                except Exception as e:
                    print(f"Error generating subplot for {original_sentence}: {e}")
                    trees.append(go.Figure())

        while len(trees) < 10:
            trees.append(go.Figure())

        execution_time = f"<div class='execution-time'>Step 3 completed in {time.time() - start_time:.2f} seconds</div>"

        return trees[:10] + [execution_time]

    def handle_reparaphrasing(self) -> Tuple[List[str], str]:
        """Wrapper for re-paraphrasing that formats results as HTML"""
        start_time = time.time()

        results = self.pipeline.re_paraphrasing()
        html_outputs = []

        # Generate HTML for each batch of re-paraphrased sentences
        for sampling_strategy, masking_dict in results.items():
            for masking_strategy, sentences in masking_dict.items():
                for original_sent, data in sentences.items():
                    if data["re_paraphrased_sentences"]:
                        html = reparaphrased_sentences_html(data["re_paraphrased_sentences"])
                        html_outputs.append(html)

        # Pad with empty HTML if needed
        while len(html_outputs) < 120:
            html_outputs.append("")

        execution_time = f"<div class='execution-time'>Step 4 completed in {time.time() - start_time:.2f} seconds</div>"

        return html_outputs[:120] + [execution_time]


def create_gradio_interface(config):
    """Creates the Gradio interface with the updated pipeline"""
    interface = WatermarkerInterface(config)

    with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
        # CSS to enable scrolling for reparaphrased sentences and sampling plots
        demo.css = """
        /* Set fixed height for the reparaphrased tabs container only */
        .gradio-container .tabs[id="reparaphrased-tabs"],
        .gradio-container .tabs[id="sampling-tabs"] {
            overflow-x: hidden;
            white-space: normal;
            border-radius: 8px;
            max-height: 600px; /* Set fixed height for the entire tabs component */
            overflow-y: auto; /* Enable vertical scrolling inside the container */
        }

        /* Tab content styling for reparaphrased and sampling tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tabitem,
        .gradio-container .tabs[id="sampling-tabs"] .tabitem {
            overflow-x: hidden;
            white-space: normal;
            display: block;
            border-radius: 8px;
        }

        /* Make the tab navigation fixed at the top for scrollable tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav {
            display: flex;
            overflow-x: auto;
            white-space: nowrap;
            scrollbar-width: thin;
            border-radius: 8px;
            scrollbar-color: #888 #f1f1f1;
            position: sticky;
            top: 0;
            background: white;
            z-index: 100;
        }

        /* Dropdown menu for scrollable tabs styling */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown {
            position: relative;
            display: inline-block;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content {
            display: none;
            position: absolute;
            background-color: #f9f9f9;
            min-width: 160px;
            box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
            z-index: 1;
            max-height: 300px;
            overflow-y: auto;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown:hover .tab-dropdown-content,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown:hover .tab-dropdown-content {
            display: block;
        }

        /* Scrollbar styling for scrollable tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar {
            height: 8px;
            border-radius: 8px;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar-track,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar-track {
            background: #f1f1f1;
            border-radius: 8px;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar-thumb,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar-thumb {
            background: #888;
            border-radius: 8px;
        }

        /* Tab button styling for scrollable tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-item,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-item {
            flex: 0 0 auto;
            border-radius: 8px;
        }

        /* Plot container styling specifically for sampling tabs */
        .gradio-container .tabs[id="sampling-tabs"] .plot-container {
            min-height: 600px;
            max-height: 1800px;
            overflow-y: auto;
        }

        /* Ensure text wraps in HTML components */
        .gradio-container .prose {
            white-space: normal;
            word-wrap: break-word;
            overflow-wrap: break-word;
        }

        /* Dropdown button styling for scrollable tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown button,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown button {
            background-color: #f0f0f0;
            border: 1px solid #ddd;
            border-radius: 4px;
            padding: 5px 10px;
            cursor: pointer;
            margin: 2px;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown button:hover,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown button:hover {
            background-color: #e0e0e0;
        }

        /* Style dropdown content items for scrollable tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content div,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content div {
            padding: 8px 12px;
            cursor: pointer;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content div:hover,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content div:hover {
            background-color: #e0e0e0;
        }

        /* Custom styling for execution time display */
        .execution-time {
            text-align: right;
            padding: 8px 16px;
            font-family: inherit;
            color: #555;
            font-size: 0.9rem;
            font-style: italic;
            margin-left: auto;
            width: 100%;
            border-top: 1px solid #eee;
            margin-top: 8px;
        }

        /* Layout for section headers with execution time */
        .section-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            width: 100%;
            margin-bottom: 12px;
        }

        .section-header h3 {
            margin: 0;
        }
        """
        gr.Markdown("# **AIISC Watermarking Model**")

        with gr.Column():
            gr.Markdown("## Input Prompt")
            user_input = gr.Textbox(
                label="Enter Your Prompt",
                placeholder="Type your text here..."
            )

        with gr.Row():
            with gr.Column(scale=3):
                gr.Markdown("## Step 1: Paraphrasing, LCS and Entailment Analysis")
            with gr.Column(scale=1):
                step1_time = gr.HTML()

        paraphrase_button = gr.Button("Generate Paraphrases")
        highlighted_user_prompt = gr.HTML(label="Highlighted User Prompt")

        with gr.Tabs():
            with gr.TabItem("Accepted Paraphrased Sentences"):
                highlighted_accepted_sentences = gr.HTML()
            with gr.TabItem("Discarded Paraphrased Sentences"):
                highlighted_discarded_sentences = gr.HTML()

        with gr.Row():
            with gr.Column(scale=3):
                gr.Markdown("## Step 2: Where to Mask?")
            with gr.Column(scale=1):
                step2_time = gr.HTML()

        masking_button = gr.Button("Apply Masking")
        gr.Markdown("### Masked Sentence Trees")
        tree1_plots = []
        with gr.Tabs() as tree1_tabs:
            for i in range(10):
                with gr.TabItem(f"Masked Sentence {i+1}"):
                    tree1 = gr.Plot()
                    tree1_plots.append(tree1)

        with gr.Row():
            with gr.Column(scale=3):
                gr.Markdown("## Step 3: How to Mask?")
            with gr.Column(scale=1):
                step3_time = gr.HTML()

        sampling_button = gr.Button("Sample Words")
        gr.Markdown("### Sampled Sentence Trees")

        tree2_plots = []
        # Add elem_id to make this tab container scrollable
        with gr.Tabs(elem_id="sampling-tabs") as tree2_tabs:
            for i in range(10):
                with gr.TabItem(f"Sampled Sentence {i+1}"):
                    # Add a custom class to the container to enable proper styling
                    with gr.Column(elem_classes=["plot-container"]):
                        tree2 = gr.Plot()
                        tree2_plots.append(tree2)

        with gr.Row():
            with gr.Column(scale=3):
                gr.Markdown("## Step 4: Re-paraphrasing")
            with gr.Column(scale=1):
                step4_time = gr.HTML()

        reparaphrase_button = gr.Button("Re-paraphrase")
        gr.Markdown("### Reparaphrased Sentences")
        reparaphrased_sentences_tabs = []
        with gr.Tabs(elem_id="reparaphrased-tabs") as reparaphrased_tabs:
            for i in range(120):
                with gr.TabItem(f"Reparaphrased Batch {i+1}"):
                    reparaphrased_sent_html = gr.HTML()
                    reparaphrased_sentences_tabs.append(reparaphrased_sent_html)

        # Connect the interface functions to the buttons
        paraphrase_button.click(
            interface.handle_paraphrase,
            inputs=user_input,
            outputs=[
                highlighted_user_prompt,
                highlighted_accepted_sentences,
                highlighted_discarded_sentences,
                step1_time
            ]
        )

        masking_button.click(
            interface.handle_masking,
            inputs=None,
            outputs=tree1_plots + [step2_time]
        )

        sampling_button.click(
            interface.handle_sampling,
            inputs=None,
            outputs=tree2_plots + [step3_time]
        )

        reparaphrase_button.click(
            interface.handle_reparaphrasing,
            inputs=None,
            outputs=reparaphrased_sentences_tabs + [step4_time]
        )

    return demo

if __name__ == "__main__":
    project_root = Path(__file__).parent.parent
    config_path = project_root / "utils" / "config.yaml"
    config = load_config(config_path)['PECCAVI_TEXT']

    create_gradio_interface(config).launch()
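Note on the click handlers above: each handler returns a plain list whose length must match the number of wired Gradio output components (10 plot slots plus one timing HTML for masking and sampling, 120 HTML slots plus one timing HTML for re-paraphrasing), which is why results are padded with empty figures or strings and then truncated. A minimal, self-contained sketch of that fixed-slot pattern follows; the helper name pad_outputs is illustrative and is not defined in this repository.

import plotly.graph_objects as go

def pad_outputs(items, n_slots, filler):
    """Pad or truncate `items` so exactly `n_slots` values are returned to Gradio."""
    padded = list(items) + [filler] * max(0, n_slots - len(items))
    return padded[:n_slots]

# e.g. three generated figures feeding ten gr.Plot components plus one gr.HTML timing slot
figures = [go.Figure() for _ in range(3)]
outputs = pad_outputs(figures, 10, go.Figure()) + ["<div class='execution-time'>done</div>"]
assert len(outputs) == 11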
__pycache__/app.cpython-310.pyc
ADDED
Binary file (747 Bytes)
app.py
ADDED
@@ -0,0 +1,21 @@

import gradio as gr
from UI.gradio import create_gradio_interface

from pathlib import Path
from utils.config import load_config

project_root = Path(__file__).resolve().parent
config_path = project_root / "utils" / "config.yaml"
config = load_config(config_path)['PECCAVI_TEXT']

def main():
    """
    This function is the entry point for the PECCAVI Watermarking Model.

    It creates the Gradio interface for the model and runs it.
    """
    create_gradio_interface(config).launch()

if __name__ == "__main__":
    main()
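Both app.py and the __main__ block of UI/gradio.py read utils/config.yaml through utils.config.load_config and then select the PECCAVI_TEXT section. The 18-line utils/config.py itself is not shown in this view; the snippet below is only a hypothetical sketch of what such a YAML-based loader typically looks like, not the committed implementation.

# Hypothetical sketch; the actual utils/config.py is not visible in this diff view.
import yaml

def load_config(config_path):
    """Read a YAML file and return its contents as a nested dict."""
    with open(config_path, "r") as f:
        return yaml.safe_load(f)

# Usage mirroring app.py:
# config = load_config("utils/config.yaml")["PECCAVI_TEXT"]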
environment.yml
ADDED
@@ -0,0 +1,245 @@
name: panda
channels:
  - conda-forge
  - defaults
dependencies:
  - _libgcc_mutex=0.1=conda_forge
  - _openmp_mutex=4.5=2_gnu
  - asttokens=2.4.1=pyhd8ed1ab_0
  - bzip2=1.0.8=h5eee18b_6
  - ca-certificates=2024.8.30=hbcca054_0
  - comm=0.2.2=pyhd8ed1ab_0
  - debugpy=1.8.6=py310hf71b8c6_0
  - decorator=5.1.1=pyhd8ed1ab_0
  - exceptiongroup=1.2.2=pyhd8ed1ab_0
  - executing=2.1.0=pyhd8ed1ab_0
  - ipykernel=6.29.5=pyh3099207_0
  - ipython=8.27.0=pyh707e725_0
  - jedi=0.19.1=pyhd8ed1ab_0
  - jupyter_client=8.6.3=pyhd8ed1ab_0
  - jupyter_core=5.7.2=pyh31011fe_1
  - krb5=1.21.3=h143b758_0
  - ld_impl_linux-64=2.40=h12ee557_0
  - libedit=3.1.20230828=h5eee18b_0
  - libffi=3.4.4=h6a678d5_1
  - libgcc=14.1.0=h77fa898_1
  - libgcc-ng=14.1.0=h69a702a_1
  - libgomp=14.1.0=h77fa898_1
  - libsodium=1.0.20=h4ab18f5_0
  - libstdcxx=14.1.0=hc0a3c3a_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.41.5=h5eee18b_0
  - matplotlib-inline=0.1.7=pyhd8ed1ab_0
  - ncurses=6.4=h6a678d5_0
  - nest-asyncio=1.6.0=pyhd8ed1ab_0
  - openssl=3.3.2=hb9d3cd8_0
  - packaging=24.1=pyhd8ed1ab_0
  - parso=0.8.4=pyhd8ed1ab_0
  - pexpect=4.9.0=pyhd8ed1ab_0
  - pickleshare=0.7.5=py_1003
  - pip=24.2=py310h06a4308_0
  - platformdirs=4.3.6=pyhd8ed1ab_0
  - prompt-toolkit=3.0.48=pyha770c72_0
  - ptyprocess=0.7.0=pyhd3deb0d_0
  - pure_eval=0.2.3=pyhd8ed1ab_0
  - pygments=2.18.0=pyhd8ed1ab_0
  - python=3.10.14=h955ad1f_1
  - python_abi=3.10=2_cp310
  - pyzmq=26.2.0=py310h71f11fc_2
  - readline=8.2=h5eee18b_0
  - setuptools=75.1.0=py310h06a4308_0
  - sqlite=3.45.3=h5eee18b_0
  - stack_data=0.6.2=pyhd8ed1ab_0
  - tk=8.6.14=h39e8969_0
  - tornado=6.4.1=py310ha75aee5_1
  - traitlets=5.14.3=pyhd8ed1ab_0
  - typing_extensions=4.12.2=pyha770c72_0
  - wcwidth=0.2.13=pyhd8ed1ab_0
  - wheel=0.44.0=py310h06a4308_0
  - xz=5.4.6=h5eee18b_1
  - zeromq=4.3.5=ha4adb4c_5
  - zlib=1.2.13=h5eee18b_1
  - pip:
    - absl-py==2.1.0
    - accelerate==0.33.0
    - aiofiles==23.2.1
    - aiohappyeyeballs==2.3.5
    - aiohttp==3.10.3
    - aiosignal==1.3.1
    - altgraph==0.17.4
    - annotated-types==0.7.0
    - anyio==4.6.0
    - astunparse==1.6.3
    - async-timeout==4.0.3
    - attrs==24.2.0
    - av==12.0.0
    - backports-tarfile==1.2.0
    - beautifulsoup4==4.12.3
    - build==1.2.2
    - cachetools==5.5.0
    - certifi==2024.7.4
    - cffi==1.17.1
    - charset-normalizer==3.3.2
    - clean-fid==0.1.35
    - click==8.1.7
    - colorama==0.4.6
    - contextlib2==21.6.0
    - contourpy==1.2.1
    - cryptography==43.0.1
    - cycler==0.12.1
    - datasets==2.21.0
    - diffusers==0.27.2
    - dill==0.3.8
    - docker-pycreds==0.4.0
    - docutils==0.21.2
    - fastapi==0.115.0
    - ffmpy==0.4.0
    - filelock==3.15.4
    - flatbuffers==24.3.25
    - fonttools==4.53.1
    - frozenlist==1.4.1
    - fsspec==2024.6.1
    - gast==0.4.0
    - gdown==5.2.0
    - gitdb==4.0.11
    - gitpython==3.1.43
    - google-auth==2.35.0
    - google-auth-oauthlib==0.4.6
    - google-pasta==0.2.0
    - gradio==4.44.0
    - gradio-client==1.3.0
    - grpcio==1.65.4
    - h11==0.14.0
    - h5py==3.11.0
    - httpcore==1.0.6
    - httpx==0.27.2
    - huggingface-hub==0.25.2
    - idna==3.7
    - imageio==2.35.0
    - importlib-metadata==8.2.0
    - importlib-resources==6.4.5
    - jaraco-classes==3.4.0
    - jaraco-context==6.0.1
    - jaraco-functools==4.1.0
    - jeepney==0.8.0
    - jinja2==3.1.4
    - joblib==1.4.2
    - json-with-comments==1.2.7
    - keras==3.5.0
    - keras-preprocessing==1.1.2
    - keyring==25.4.1
    - kiwisolver==1.4.5
    - kornia==0.7.4
    - kornia-rs==0.1.7
    - lazy-loader==0.4
    - libclang==18.1.1
    - markdown==3.6
    - markdown-it-py==3.0.0
    - markupsafe==2.1.5
    - matplotlib==3.9.2
    - mdurl==0.1.2
    - ml-collections==0.1.1
    - ml-dtypes==0.4.0
    - more-itertools==10.5.0
    - multidict==6.0.5
    - multiprocess==0.70.16
    - namex==0.0.8
    - networkx==3.3
    - nh3==0.2.18
    - nltk==3.9.1
    - numpy==1.26.4
    - nvidia-cublas-cu11==11.10.3.66
    - nvidia-cuda-nvrtc-cu11==11.7.99
    - nvidia-cuda-runtime-cu11==11.7.99
    - nvidia-cudnn-cu11==8.5.0.96
    - oauthlib==3.2.2
    - opencv-python==4.10.0.84
    - opencv-python-headless==4.10.0.84
    - opt-einsum==3.3.0
    - optree==0.12.1
    - orjson==3.10.7
    - pandas==2.2.2
    - pillow==10.4.0
    - pkginfo==1.10.0
    - plotly==5.24.1
    - protobuf==4.25.5
    - psutil==5.9.8
    - pyarrow==17.0.0
    - pyasn1==0.6.1
    - pyasn1-modules==0.4.1
    - pycparser==2.22
    - pydantic==2.9.2
    - pydantic-core==2.23.4
    - pydub==0.25.1
    - pyinstaller==6.10.0
    - pyinstaller-hooks-contrib==2024.8
    - pyparsing==3.1.2
    - pyproject-hooks==1.1.0
    - pysocks==1.7.1
    - python-dateutil==2.9.0.post0
    - python-multipart==0.0.12
    - pytorch-msssim==1.0.0
    - pytorchcv==0.0.73
    - pytz==2023.3.post1
    - pyyaml==6.0.2
    - readme-renderer==44.0
    - regex==2024.7.24
    - requests==2.32.3
    - requests-oauthlib==2.0.0
    - requests-toolbelt==1.0.0
    - rfc3986==2.0.0
    - rich==13.7.1
    - rsa==4.9
    - ruff==0.6.9
    - safetensors==0.4.4
    - saliency==0.2.1
    - scikit-image==0.24.0
    - scikit-learn==1.6.0
    - scipy==1.14.0
    - secretstorage==3.3.3
    - semantic-version==2.10.0
    - sentence-transformers==3.3.1
    - sentry-sdk==2.15.0
    - setproctitle==1.3.3
    - shapely==2.0.5
    - shellingham==1.5.4
    - six==1.12.0
    - smmap==5.0.1
    - sniffio==1.3.1
    - soupsieve==2.6
    - spaces==0.30.2
    - starlette==0.38.6
    - tenacity==9.0.0
    - tensorboard==2.17.1
    - tensorboard-data-server==0.7.2
    - tensorboard-plugin-wit==1.8.1
    - tensorflow==2.17.0
    - tensorflow-estimator==2.10.0
    - tensorflow-hub==0.16.1
    - tensorflow-intel==0.0.1
    - tensorflow-io-gcs-filesystem==0.31.0
    - termcolor==1.1.0
    - tf-keras==2.17.0
    - threadpoolctl==3.5.0
    - tifffile==2024.8.10
    - timm==1.0.10
    - tokenizers==0.19.1
    - tomli==2.0.1
    - tomlkit==0.12.0
    - torch==1.13.1
    - torchvision==0.14.1
    - tqdm==4.66.5
    - transformers==4.43.3
    - twine==5.1.1
    - typer==0.12.5
    - tzdata==2024.1
    - urllib3==2.2.2
    - uvicorn==0.31.0
    - wandb==0.18.3
    - websockets==12.0
    - werkzeug==3.0.4
    - wrapt==1.11.2
    - xxhash==3.4.1
    - yarl==1.9.4
    - zipp==3.20.0
prefix: /home/ashhar21137/miniconda3/envs/panda
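The environment is named panda and pins Python 3.10 with a mixed conda/pip dependency set; it can presumably be recreated with the standard conda workflow (conda env create -f environment.yml, then conda activate panda), which is ordinary conda usage rather than anything documented in this commit.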
metrics/distortion.py
ADDED
@@ -0,0 +1,370 @@
import os
import sys
from tqdm import tqdm
import numpy as np
import torch
import matplotlib.pyplot as plt
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from bert_score import BERTScorer
from bert_score.utils import model2layers
from nltk.tokenize import word_tokenize
from Levenshtein import distance as levenshtein_distance
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from config.config import load_config
config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.yaml')
config = load_config(config_path)['PECCAVI_TEXT']['Metrics']

class SentenceDistortionCalculator:
    """
    A class to calculate and analyze distortion metrics between an original sentence and modified sentences.
    """
    def __init__(self, config, original_sentence, paraphrased_sentences):
        """
        Initialize the calculator with the original sentence and a list of modified sentences.
        """
        self.original_sentence = original_sentence
        self.paraphrased_sentences = paraphrased_sentences

        self.levenshtein_distances = {}
        self.bert_scores = {}
        self.mover_scores = {}

        self.normalized_levenshtein = {}
        self.normalized_bert_scores = {}
        self.normalized_mover_scores = {}
        self.combined_distortions = {}

        self.tokenizer = GPT2TokenizerFast.from_pretrained(config['Distortion'])
        self.model = GPT2LMHeadModel.from_pretrained(config['Distortion'])
        self.model.eval()

    def calculate_all_metrics(self):
        """
        Calculate all distortion metrics for each modified sentence.
        """
        for idx, modified_sentence in tqdm(enumerate(self.paraphrased_sentences), total=len(self.paraphrased_sentences), desc="Calculating Metrics"):
            key = f"Sentence_{idx+1}"
            self.levenshtein_distances[key] = self._calculate_levenshtein_distance(modified_sentence)
            self.bert_scores[key] = self._calculate_bert_score(modified_sentence)
            self.mover_scores[key] = self._calculate_mover_score(modified_sentence)

    def normalize_metrics(self):
        """
        Normalize all metrics to be between 0 and 1.
        """
        for _ in tqdm(range(1), desc="Normalizing Metrics"):  # Wrap the normalization process in tqdm
            self.normalized_levenshtein = self._normalize_dict(self.levenshtein_distances)
            self.normalized_bert_scores = self._normalize_dict(self.bert_scores)
            self.normalized_mover_scores = self._normalize_dict(self.mover_scores)

    def calculate_combined_distortion(self):
        """
        Calculate the combined distortion using the root mean square of the normalized metrics.
        """
        for _ in tqdm(range(1), desc="Calculating Combined Distortion"):
            for key in self.normalized_levenshtein.keys():
                rms = np.sqrt(
                    (
                        self.normalized_levenshtein[key] ** 2 +
                        self.normalized_bert_scores[key] ** 2 +
                        self.normalized_mover_scores[key] ** 2
                    ) / 3
                )
                self.combined_distortions[key] = rms

    def plot_metrics(self):
        """
        Plot each normalized metric and the combined distortion in separate graphs.
        """
        keys = list(self.normalized_levenshtein.keys())
        indices = np.arange(len(keys))

        # Prepare data for plotting
        metrics = {
            'Levenshtein Distance': [self.normalized_levenshtein[key] for key in keys],
            'BERTScore': [self.normalized_bert_scores[key] for key in keys],
            'MOVERscore': [self.normalized_mover_scores[key] for key in keys],
            'Combined Distortion': [self.combined_distortions[key] for key in keys]
        }

        # Plot each metric separately
        for metric_name, values in tqdm(metrics.items(), desc="Plotting Metrics"):
            plt.figure(figsize=(12, 6))
            plt.plot(indices, values, marker='o', color=np.random.rand(3,))
            plt.xlabel('Sentence Index')
            plt.ylabel('Normalized Value (0-1)')
            plt.title(f'Normalized {metric_name}')
            plt.grid(True)
            plt.tight_layout()
            plt.show()

    def _calculate_levenshtein_distance(self, modified_sentence):
        """
        Calculate the word-level Levenshtein distance between the original and modified sentence.
        """
        words1 = word_tokenize(self.original_sentence)
        words2 = word_tokenize(modified_sentence)
        lev_distance = levenshtein_distance(words1, words2)
        return (lev_distance / max(len(words1), len(words2)))

    def _calculate_bert_score(self, modified_sentence):
        """
        Compute the BERTScore similarity between the original and modified sentence.
        Returns 1 - F1 score to represent dissimilarity.
        """
        if not hasattr(self, 'original_sentence'):
            raise ValueError("original_sentence is not set. Please set self.original_sentence before calling this function.")
        if not isinstance(modified_sentence, str):
            raise ValueError("modified_sentence must be a string.")

        model_type = "microsoft/deberta-xlarge-mnli"
        num_layers = model2layers[model_type]

        if not hasattr(self, "cached_bertscorer"):
            self.cached_bertscorer = BERTScorer(
                model_type=model_type,
                num_layers=num_layers,
                batch_size=1,  # Single sentence comparison
                nthreads=4,
                all_layers=False,
                idf=False,
                device="cuda" if torch.cuda.is_available() else "cpu",
                lang="en"
            )

        # Compute BERTScore
        _, _, F1 = self.cached_bertscorer.score(
            cands=[modified_sentence],
            refs=[self.original_sentence],
            verbose=False,
            batch_size=1
        )

        return 1 - F1.item()  # Return dissimilarity score

    def _calculate_mover_score(self, modified_sentence, model_name='all-MiniLM-L6-v2'):
        """Compute MoverScore using word-level embeddings."""
        if not self.original_sentence:
            raise ValueError("Original sentence not provided.")

        # Tokenize sentences
        original_tokens = self.original_sentence.split()
        modified_tokens = modified_sentence.split()
        model = SentenceTransformer(model_name)

        # Compute word embeddings
        original_embeddings = model.encode(original_tokens, convert_to_numpy=True)
        modified_embeddings = model.encode(modified_tokens, convert_to_numpy=True)

        # Compute cost matrix (cosine distance)
        cost_matrix = cdist(original_embeddings, modified_embeddings, metric='cosine')

        # Solve optimal transport problem (Hungarian Algorithm)
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        # Compute IDF weights
        vectorizer = TfidfVectorizer()
        vectorizer.fit([self.original_sentence, modified_sentence])
        idf_values = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))

        # Apply IDF weighting to aligned word pairs
        idf_weights_original = np.array([idf_values.get(word.lower(), 1.0) for word in original_tokens])
        idf_weights_modified = np.array([idf_values.get(word.lower(), 1.0) for word in modified_tokens])
        combined_idf_weights = (idf_weights_original[row_ind] + idf_weights_modified[col_ind]) / 2
        weighted_score = np.sum((1 - cost_matrix[row_ind, col_ind]) * combined_idf_weights) / np.sum(combined_idf_weights)

        return 1 - weighted_score  # Higher score = more dissimilar

    def _normalize_dict(self, metric_dict):
        """
        Normalize the values in a dictionary to be between 0 and 1.
        """
        values = np.array(list(metric_dict.values()))
        min_val = values.min()
        max_val = values.max()
        if max_val - min_val == 0:
            normalized_values = np.zeros_like(values)
        else:
            normalized_values = (values - min_val) / (max_val - min_val)
        return dict(zip(metric_dict.keys(), normalized_values))

    def get_normalized_metrics(self):
        """
        Get all normalized metrics as a dictionary.
        """
        return {
            'Min Edit Distance': self.normalized_levenshtein,
            'BERTScore': self.normalized_bert_scores,
            'Mover Score': self.normalized_mover_scores
        }

    def get_combined_distortions(self):
        """
        Get the dictionary of combined distortion values.
        """
        return self.combined_distortions

# Example usage
if __name__ == "__main__":

    config = load_config(config_path)['PECCAVI_TEXT']['Metrics']

    # Original sentence
    original_sentence = "The quick brown fox jumps over the lazy dog"

    # Paraphrased sentences
    paraphrased_sentences = [
        # Original 1: "A swift auburn fox leaps across a sleepy canine."
        "The swift auburn fox leaps across a sleepy canine.",
        "A quick auburn fox leaps across a sleepy canine.",
        "A swift ginger fox leaps across a sleepy canine.",
        "A swift auburn fox bounds across a sleepy canine.",
        "A swift auburn fox leaps across a tired canine.",
        "Three swift auburn foxes leap across a sleepy canine.",
        "The vulpine specimen rapidly traverses over a dormant dog.",
        "Like lightning, the russet hunter soars over the drowsy guardian.",
        "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
        "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
        "A swift auburn predator navigates across a lethargic pet.",
        "Subject A (fox) demonstrates velocity over Subject B (dog).",

        # Original 2: "The agile russet fox bounds over an idle hound."
        "Some agile russet foxes bound over an idle hound.",
        "The nimble russet fox bounds over an idle hound.",
        "The agile brown fox bounds over an idle hound.",
        "The agile russet fox jumps over an idle hound.",
        "The agile russet fox bounds over a lazy hound.",
        "Two agile russet foxes bound over an idle hound.",
        "A dexterous vulpine surpasses a stationary canine.",
        "Quick as thought, the copper warrior sails over the guardian.",
        "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
        "A dexterous V. vulpes exceeds the plane of an inactive canine.",
        "An agile russet hunter maneuvers above a resting hound.",
        "Test subject F-1 achieves displacement superior to subject D-1.",

        # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
        "The nimble mahogany vulpine vaults above a drowsy dog.",
        "A swift mahogany vulpine vaults above a drowsy dog.",
        "A nimble reddish vulpine vaults above a drowsy dog.",
        "A nimble mahogany fox vaults above a drowsy dog.",
        "A nimble mahogany vulpine leaps above a drowsy dog.",
        "Four nimble mahogany vulpines vault above a drowsy dog.",
        "An agile specimen of reddish fur surpasses a somnolent canine.",
        "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
        "Tha quick brown beastie jumps o'er the tired pup, aye.",
        "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
        "A nimble rust-colored predator crosses above a drowsy pet.",
        "Observed: Subject Red executes vertical motion over Subject Gray.",

        # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
        "A speedy copper-colored fox hops over the lethargic pup.",
        "The quick copper-colored fox hops over the lethargic pup.",
        "The speedy bronze fox hops over the lethargic pup.",
        "The speedy copper-colored fox jumps over the lethargic pup.",
        "The speedy copper-colored fox hops over the tired pup.",
        "Multiple speedy copper-colored foxes hop over the lethargic pup.",
        "A rapid vulpine of bronze hue traverses an inactive young canine.",
        "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
        "Tha fast copper beastie leaps o'er the sleepy wee dog.",
        "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
        "A fleet copper-toned predator moves past a sluggish young dog.",
        "Field note: Adult fox subject exceeds puppy subject vertically.",

        # Original 5: "A rapid tawny fox springs over a sluggish dog."
        "The rapid tawny fox springs over a sluggish dog.",
        "A quick tawny fox springs over a sluggish dog.",
        "A rapid golden fox springs over a sluggish dog.",
        "A rapid tawny fox jumps over a sluggish dog.",
        "A rapid tawny fox springs over a lazy dog.",
        "Six rapid tawny foxes spring over a sluggish dog.",
        "An expeditious yellowish vulpine surpasses a torpid canine.",
        "Fast as a bullet, the golden hunter vaults over the idle guard.",
        "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
        "One V. vulpes displays rapid transit over one inactive C. familiaris.",
        "A speedy yellow-brown predator bypasses a motionless dog.",
        "Log entry: Vulpine subject achieves swift vertical displacement.",

        # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
        "A fleet-footed chestnut fox soars above an indolent canine.",
        "The swift chestnut fox soars above an indolent canine.",
        "The fleet-footed brown fox soars above an indolent canine.",
        "The fleet-footed chestnut fox leaps above an indolent canine.",
        "The fleet-footed chestnut fox soars above a lazy canine.",
        "Several fleet-footed chestnut foxes soar above an indolent canine.",
        "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
        "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
        "Tha quick brown beastie sails o'er the sleepy hound, ken.",
        "Single agile V. vulpes achieves elevation above stationary canine.",
        "A nimble brown predator glides over an unmoving domestic animal.",
        "Research note: Brown subject displays superior vertical mobility.",

        # Original 7: "A fast ginger fox hurdles past a slothful dog."
        "The fast ginger fox hurdles past a slothful dog.",
        "A quick ginger fox hurdles past a slothful dog.",
        "A fast red fox hurdles past a slothful dog.",
        "A fast ginger fox jumps past a slothful dog.",
        "A fast ginger fox hurdles past a lazy dog.",
        "Five fast ginger foxes hurdle past a slothful dog.",
        "A rapid orange vulpine bypasses a lethargic canine.",
        "Quick as lightning, the flame-colored hunter races past the lazy guard.",
        "Tha swift ginger beastie leaps past the tired doggy, ye see.",
        "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
        "A speedy red-orange predator overtakes a motionless dog.",
        "Data point: Orange subject demonstrates rapid transit past Gray subject.",

        # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
        "A spry rusty-colored fox jumps across a dozing hound.",
        "The agile rusty-colored fox jumps across a dozing hound.",
        "The spry reddish fox jumps across a dozing hound.",
        "The spry rusty-colored fox leaps across a dozing hound.",
        "The spry rusty-colored fox jumps across a sleeping hound.",
        "Multiple spry rusty-colored foxes jump across a dozing hound.",
        "An agile rust-toned vulpine traverses a somnolent canine.",
        "Nimble as thought, the copper hunter bounds over the resting guard.",
        "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
        "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
        "A lithe rust-tinted predator moves past a slumbering dog.",
        "Observation: Russet subject exhibits agility over dormant subject.",

        # Original 9: "A quick tan fox leaps over an inactive dog."
        "The quick tan fox leaps over an inactive dog.",
        "A swift tan fox leaps over an inactive dog.",
        "A quick beige fox leaps over an inactive dog.",
        "A quick tan fox jumps over an inactive dog.",
        "A quick tan fox leaps over a motionless dog.",
        "Seven quick tan foxes leap over an inactive dog.",
        "A rapid light-brown vulpine surpasses a stationary canine.",
        "Fast as wind, the sand-colored hunter soars over the still guard.",
        "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
        "One agile fawn V. vulpes traverses one immobile C. familiaris.",
        "A fleet tan-colored predator bypasses an unmoving dog.",
        "Field report: Tan subject demonstrates movement over static subject.",

        # Original 10: "The brisk auburn vulpine bounces over a listless canine."
        "Some brisk auburn vulpines bounce over a listless canine.",
        "The quick auburn vulpine bounces over a listless canine.",
        "The brisk russet vulpine bounces over a listless canine.",
        "The brisk auburn fox bounces over a listless canine.",
        "The brisk auburn vulpine jumps over a listless canine.",
        "Five brisk auburn vulpines bounce over a listless canine.",
        "The expeditious specimen supersedes a quiescent Canis lupus.",
        "Swift as wind, the russet hunter vaults over the idle guardian.",
        "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
        "One V. vulpes achieves displacement over inactive C. familiaris.",
        "A high-velocity auburn predator traverses an immobile animal.",
        "Final observation: Red subject shows mobility over Gray subject."
    ]

    distortion_calculator = SentenceDistortionCalculator(config, original_sentence, paraphrased_sentences)
    for _ in tqdm(range(1)):
        distortion_calculator.calculate_all_metrics()
        distortion_calculator.normalize_metrics()
        distortion_calculator.calculate_combined_distortion()
        distortion_calculator.plot_metrics()
        print("Normalized Metrics:", distortion_calculator.get_normalized_metrics())
        print("Combined Distortion:", distortion_calculator.get_combined_distortions())
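calculate_combined_distortion folds the three min-max-normalized metrics into a single root-mean-square value per sentence. Below is a small, self-contained sketch of just that normalize-then-RMS step; the toy dictionaries are made up for illustration and the helper name normalize mirrors, but is not, the class's _normalize_dict.

import numpy as np

def normalize(d):
    """Min-max normalize dict values to [0, 1], mirroring _normalize_dict."""
    vals = np.array(list(d.values()), dtype=float)
    span = vals.max() - vals.min()
    normed = np.zeros_like(vals) if span == 0 else (vals - vals.min()) / span
    return dict(zip(d.keys(), normed))

lev = normalize({"Sentence_1": 0.2, "Sentence_2": 0.5})
bert = normalize({"Sentence_1": 0.1, "Sentence_2": 0.4})
mover = normalize({"Sentence_1": 0.3, "Sentence_2": 0.6})

combined = {
    k: np.sqrt((lev[k] ** 2 + bert[k] ** 2 + mover[k] ** 2) / 3)
    for k in lev
}
print(combined)  # Sentence_1 -> 0.0, Sentence_2 -> 1.0 after min-max scaling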
renderers/__pycache__/highlighter.cpython-310.pyc
ADDED
Binary file (4.98 kB)
renderers/__pycache__/highlighter.cpython-311.pyc
ADDED
Binary file (6.79 kB)
renderers/__pycache__/plot_3d.cpython-310.pyc
ADDED
Binary file (4.34 kB)
renderers/__pycache__/plot_3d.cpython-311.pyc
ADDED
Binary file (6 kB)
renderers/__pycache__/tree.cpython-310.pyc
ADDED
Binary file (10.6 kB)
renderers/__pycache__/tree.cpython-311.pyc
ADDED
Binary file (21.1 kB)
renderers/highlighter.py
ADDED
@@ -0,0 +1,162 @@
import re

def highlight_common_words(common_words, sentences, title):
    """
    Highlight common words in sentences by adding color-coded background and unique IDs.

    Args:
        common_words (list of tuples): List of tuples where each tuple contains a word's index and the word.
        sentences (list of str): List of sentences to search through.
        title (str): The title for the HTML output.

    Returns:
        str: HTML string with the highlighted sentences.
    """
    color_map = {}
    color_index = 0
    highlighted_html = []

    # Process each sentence
    for idx, sentence in enumerate(sentences, start=1):
        sentence_with_idx = f"{idx}. {sentence}"
        highlighted_sentence = sentence_with_idx

        # Highlight common words in each sentence
        for index, word in common_words:
            if word not in color_map:
                color_map[word] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
                color_index += 1

            # Escape word and create regex pattern to match whole word
            escaped_word = re.escape(word)
            pattern = rf'\b{escaped_word}\b'

            # Replace the word with highlighted version
            highlighted_sentence = re.sub(
                pattern,
                lambda m, idx=index, color=color_map[word]: (
                    f'<span style="background-color: {color}; font-weight: bold;'
                    f' padding: 2px 4px; border-radius: 2px; position: relative;">'
                    f'<span style="background-color: black; color: white; border-radius: 50%;'
                    f' padding: 2px 5px; margin-right: 5px;">{idx}</span>'
                    f'{m.group(0)}'
                    f'</span>'
                ),
                highlighted_sentence,
                flags=re.IGNORECASE
            )

        highlighted_html.append(highlighted_sentence)

    # Format the HTML output with the title
    final_html = "<br><br>".join(highlighted_html)
    return f'''
    <div style="border: solid 1px #FFFFFF; padding: 16px; background-color: #000000; color: #FFFFFF; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
        <h3 style="margin-top: 0; font-size: 1em; color: #FFFFFF;">{title}</h3>
        <div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px; color: #FFFFFF;">{final_html}</div>
    </div>
    '''

def highlight_common_words_dict(common_words, sentences, title):
    """
    Highlight common words in sentences (from a dictionary) by adding color-coded background and unique IDs.

    Args:
        common_words (list of tuples): List of tuples where each tuple contains a word's index and the word.
        sentences (dict): A dictionary of sentences where the key is the sentence and the value is an entailment score.
        title (str): The title for the HTML output.

    Returns:
        str: HTML string with the highlighted sentences and their entailment scores.
    """
    color_map = {}
    color_index = 0
    highlighted_html = []

    # Process each sentence and its score
    for idx, (sentence, score) in enumerate(sentences.items(), start=1):
        sentence_with_idx = f"{idx}. {sentence}"
        highlighted_sentence = sentence_with_idx

        # Highlight common words in each sentence
        for index, word in common_words:
            if word not in color_map:
                color_map[word] = f'hsl({color_index * 60 % 360}, 70%, 80%)'
                color_index += 1

            # Escape word and create regex pattern to match whole word
            escaped_word = re.escape(word)
            pattern = rf'\b{escaped_word}\b'

            # Replace the word with highlighted version
            highlighted_sentence = re.sub(
                pattern,
                lambda m, idx=index, color=color_map[word]: (
                    f'<span style="background-color: {color}; font-weight: bold;'
                    f' padding: 1px 2px; border-radius: 2px; position: relative;">'
                    f'<span style="background-color: black; color: white; border-radius: 50%;'
                    f' padding: 1px 3px; margin-right: 3px; font-size: 0.8em;">{idx}</span>'
                    f'{m.group(0)}'
                    f'</span>'
                ),
                highlighted_sentence,
                flags=re.IGNORECASE
            )

        # Add the entailment score
        highlighted_html.append(
            f'<div style="margin-bottom: 5px;">'
            f'{highlighted_sentence}'
            f'<div style="display: inline-block; margin-left: 5px; padding: 3px 5px; border-radius: 3px; '
            f'background-color: #333333; color: white; font-size: 0.9em;">'
            f'Entailment Score: {score}</div></div>'
        )

    # Format the HTML output with the title
    final_html = "<br>".join(highlighted_html)
    return f'''
    <div style="background-color: #000000; color: #FFFFFF;border: solid 1px #FFFFFF; border-radius: 8px;">
        <h3 style="margin-top: 0; font-size: 1em; color: #FFFFFF;">{title}</h3>
        <div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px; color: #FFFFFF;">{final_html}</div>
    </div>
    '''

def reparaphrased_sentences_html(sentences):
    """
    Create an HTML representation of sentences with numbering.

    Args:
        sentences (list of str): List of sentences to format.

    Returns:
        str: HTML string with numbered sentences.
    """
    formatted_sentences = []

    # Process each sentence
    for idx, sentence in enumerate(sentences, start=1):
        sentence_with_idx = f"{idx}. {sentence}"
        formatted_sentences.append(sentence_with_idx)

    # Format the HTML output
    final_html = "<br><br>".join(formatted_sentences)
    return f'''
    <div style="border: solid 1px #FFFFFF; background-color: #000000; color: #FFFFFF;
                box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
        <div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
    </div>
    '''

if __name__ == "__main__":
    # Example usage
    common_words = [(1, "highlight"), (2, "numbering")]
    sentences = ["This is a test to highlight words.", "Numbering is important for clarity."]

    # Test highlight_common_words
    highlighted_html = highlight_common_words(common_words, sentences, "Test Highlighting")
    print(highlighted_html)

    # Test highlight_common_words_dict
    sentences_with_scores = {"Highlight words in this text.": 0.95, "Number sentences for clarity.": 0.8}
    highlighted_html_dict = highlight_common_words_dict(common_words, sentences_with_scores, "Test Dict Highlighting")
    print(highlighted_html_dict)
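All three helpers return raw HTML strings rather than rendering anything themselves. A minimal preview sketch, assuming illustrative inputs and a hypothetical output file name (neither is part of the module), that writes the result to disk so the styling can be checked in a browser:

from renderers.highlighter import highlight_common_words

# Illustrative (index, word) pairs and sentence list for the sketch only.
html = highlight_common_words(
    [(1, "fox"), (2, "dog")],
    ["The quick brown fox jumps over the lazy dog."],
    "Preview",
)

# Hypothetical output path, chosen only for this sketch.
with open("highlight_preview.html", "w", encoding="utf-8") as f:
    f.write(html)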
renderers/plot_3d.py
ADDED
@@ -0,0 +1,126 @@
"""
This file contains the code to plot a 3d tree
"""
import numpy as np
import plotly.graph_objects as go
from scipy.interpolate import griddata

def gen_three_D_plot(detectability_val, distortion_val, euclidean_val):
    """
    Generates a 3D surface plot showing the relationship between detectability, distortion,
    and Euclidean distance, with a focus on highlighting the "sweet spot" based on a composite score.

    The function takes three sets of values: detectability, distortion, and Euclidean distance,
    normalizes them to a [0, 1] range, and computes a composite score that combines these three metrics.
    The "sweet spot" is the point where the composite score is maximized. This sweet spot is plotted
    as a red marker on the 3D surface plot.

    The function then uses a grid interpolation method (`griddata`) to generate a smooth surface
    for the Euclidean distance over the detectability and distortion values. The result is a surface plot
    where the contours represent different Euclidean distances.

    Args:
        detectability_val (list or array): A list or array of detectability scores.
        distortion_val (list or array): A list or array of distortion scores.
        euclidean_val (list or array): A list or array of Euclidean distances.

    Returns:
        plotly.graph_objects.Figure: A Plotly figure object representing the 3D surface plot,
        with contour lines and a marker for the sweet spot.

    Raises:
        ValueError: If `griddata` fails to generate a valid interpolation, which could happen if the
        input data does not allow for a proper interpolation.

    Example:
        # Example of usage:
        detectability_vals = [0.1, 0.3, 0.5, 0.7, 0.9]
        distortion_vals = [0.2, 0.4, 0.6, 0.8, 1.0]
        euclidean_vals = [0.5, 0.3, 0.2, 0.4, 0.6]

        fig = gen_three_D_plot(detectability_vals, distortion_vals, euclidean_vals)
        fig.show()  # Displays the plot in a web browser

    Notes:
        - The composite score is calculated as:
          `composite_score = norm_detectability - (norm_distortion + norm_euclidean)`,
          where the goal is to maximize detectability and minimize distortion and Euclidean distance.
        - The `griddata` call uses nearest-neighbour interpolation to fill the surface grid.
        - The function uses the "Plasma" colorscale for the surface plot, which provides a perceptually uniform color scheme.
    """

    detectability = np.array(detectability_val)
    distortion = np.array(distortion_val)
    euclidean = np.array(euclidean_val)

    # Normalize the values to range [0, 1]
    norm_detectability = (detectability - min(detectability)) / (max(detectability) - min(detectability))
    norm_distortion = (distortion - min(distortion)) / (max(distortion) - min(distortion))
    norm_euclidean = (euclidean - min(euclidean)) / (max(euclidean) - min(euclidean))

    # Composite score: maximize detectability, minimize distortion and Euclidean distance
    composite_score = norm_detectability - (norm_distortion + norm_euclidean)

    # Find the index of the maximum score (sweet spot)
    sweet_spot_index = np.argmax(composite_score)

    # Sweet spot values
    sweet_spot_detectability = detectability[sweet_spot_index]
    sweet_spot_distortion = distortion[sweet_spot_index]
    sweet_spot_euclidean = euclidean[sweet_spot_index]

    # Create a meshgrid from the data
    x_grid, y_grid = np.meshgrid(np.linspace(min(detectability), max(detectability), 30),
                                 np.linspace(min(distortion), max(distortion), 30))

    # Interpolate z values (Euclidean distances) to fit the grid using 'nearest' method
    z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='nearest')

    if z_grid is None:
        raise ValueError("griddata could not generate a valid interpolation. Check your input data.")

    # Create the 3D contour plot with the Plasma color scale
    fig = go.Figure(data=go.Surface(
        z=z_grid,
        x=x_grid,
        y=y_grid,
        contours={
            "z": {"show": True, "start": min(euclidean), "end": max(euclidean), "size": 0.1, "usecolormap": True}
        },
        colorscale='Plasma'
    ))

    # Add a marker for the sweet spot
    fig.add_trace(go.Scatter3d(
        x=[sweet_spot_detectability],
        y=[sweet_spot_distortion],
        z=[sweet_spot_euclidean],
        mode='markers+text',
        marker=dict(size=10, color='red', symbol='circle'),
        text=["Sweet Spot"],
        textposition="top center"
    ))

    # Set axis labels
    fig.update_layout(
        scene=dict(
            xaxis_title='Detectability Score',
            yaxis_title='Distortion Score',
            zaxis_title='Euclidean Distance'
        ),
        margin=dict(l=0, r=0, b=0, t=0)
    )

    return fig

if __name__ == "__main__":
    # Example input data
    detectability_vals = [0.1, 0.3, 0.5, 0.7, 0.9]
    distortion_vals = [0.2, 0.4, 0.6, 0.8, 1.0]
    euclidean_vals = [0.5, 0.3, 0.2, 0.4, 0.6]

    # Call the function with example data
    fig = gen_three_D_plot(detectability_vals, distortion_vals, euclidean_vals)

    # Show the plot
    fig.show()
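For the example data in the __main__ block, the composite score can be checked by hand: after min-max scaling, detectability and distortion both become [0, 0.25, 0.5, 0.75, 1.0] and the Euclidean distances become [0.75, 0.25, 0.0, 0.5, 1.0], so the composite score is [-0.75, -0.25, 0.0, -0.5, -1.0] and the sweet spot is the third point (detectability 0.5, distortion 0.6, Euclidean distance 0.2). A short sketch that verifies this without drawing the figure:

import numpy as np

detectability = np.array([0.1, 0.3, 0.5, 0.7, 0.9])
distortion = np.array([0.2, 0.4, 0.6, 0.8, 1.0])
euclidean = np.array([0.5, 0.3, 0.2, 0.4, 0.6])

# Same min-max normalization and composite score as gen_three_D_plot.
norm = lambda a: (a - a.min()) / (a.max() - a.min())
composite = norm(detectability) - (norm(distortion) + norm(euclidean))

# Prints index 2, i.e. the (0.5, 0.6, 0.2) point, as the sweet spot.
print(int(np.argmax(composite)))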
renderers/tree.py
ADDED
@@ -0,0 +1,490 @@
import plotly.graph_objects as go
import textwrap
import re
from collections import defaultdict

def generate_subplot1(paraphrased_sentence, masked_sentences, strategies, highlight_info, common_grams):
    """
    Generates a subplot visualizing paraphrased and masked sentences in a tree structure.
    Highlights common words with specific colors and applies Longest Common Subsequence (LCS) numbering.

    Args:
        paraphrased_sentence (str): The paraphrased sentence to be visualized.
        masked_sentences (list of str): A list of masked sentences to be visualized.
        strategies (list of str, optional): List of strategies used for each masked sentence.
        highlight_info (list of tuples): A list of tuples where each tuple contains a word and its associated color for highlighting.
        common_grams (list of tuples): A list of tuples containing an index and a common word or phrase for LCS numbering.

    Returns:
        plotly.graph_objects.Figure: A Plotly figure representing the tree structure with highlighted words and labeled edges.
    """
    # Combine nodes into one list with appropriate labels
    if isinstance(masked_sentences, str):
        masked_sentences = [masked_sentences]
    nodes = [paraphrased_sentence] + masked_sentences
    nodes[0] += ' L0'  # Paraphrased sentence is level 0
    if len(nodes) < 2:
        print("[ERROR] Insufficient nodes for visualization")
        return go.Figure()

    for i in range(1, len(nodes)):
        nodes[i] += ' L1'  # masked sentences are level 1

    def apply_lcs_numbering(sentence, common_grams):
        """
        Applies LCS numbering to the sentence based on the common_grams.

        Args:
            sentence (str): The sentence to which the LCS numbering should be applied.
            common_grams (list of tuples): A list of common grams to be replaced with LCS numbers.

        Returns:
            str: The sentence with LCS numbering applied.
        """
        for idx, lcs in common_grams:
            sentence = re.sub(rf"\b{lcs}\b", f"({idx}){lcs}", sentence)
        return sentence

    # Apply LCS numbering
    nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]

    def highlight_words(sentence, color_map):
        """
        Highlights words in the sentence based on the color_map.

        Args:
            sentence (str): The sentence where the words will be highlighted.
            color_map (dict): A dictionary mapping words to their colors.

        Returns:
            str: The sentence with highlighted words.
        """
        for word, color in color_map.items():
            sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
        return sentence

    # Clean and wrap nodes, and highlight specified words globally
    cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
    global_color_map = dict(highlight_info)
    highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
    wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=55)) for node in highlighted_nodes]

    def get_levels_and_edges(nodes, strategies=None):
        """
        Determines tree levels and creates edges dynamically.

        Args:
            nodes (list of str): The nodes representing the sentences.
            strategies (list of str, optional): The strategies used for each edge.

        Returns:
            tuple: A tuple containing two dictionaries:
                - levels: A dictionary mapping node indices to their levels.
                - edges: A list of edges where each edge is represented by a tuple of node indices.
        """
        levels = {}
        edges = []
        for i, node in enumerate(nodes):
            level = int(node.split()[-1][1])
            levels[i] = level

        # Add edges from L0 to all L1 nodes
        root_node = next((i for i, level in levels.items() if level == 0), 0)
        for i, level in levels.items():
            if level == 1:
                edges.append((root_node, i))

        return levels, edges

    # Get levels and dynamic edges
    levels, edges = get_levels_and_edges(nodes, strategies)
    max_level = max(levels.values(), default=0)

    # Calculate positions
    positions = {}
    level_heights = defaultdict(int)
    for node, level in levels.items():
        level_heights[level] += 1

    y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
    x_gap = 2
    l1_y_gap = 10

    for node, level in levels.items():
        if level == 1:
            positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
        else:
            positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
        y_offsets[level] += 1

    def color_highlighted_words(node, color_map):
        """
        Colors the highlighted words in the node text.

        Args:
            node (str): The node text to be highlighted.
            color_map (dict): A dictionary mapping words to their colors.

        Returns:
            str: The node text with highlighted words.
        """
        parts = re.split(r'(\{\{.*?\}\})', node)
        colored_parts = []
        for part in parts:
            match = re.match(r'\{\{(.*?)\}\}', part)
            if match:
                word = match.group(1)
                color = color_map.get(word, 'black')
                colored_parts.append(f"<span style='color: {color};'>{word}</span>")
            else:
                colored_parts.append(part)
        return ''.join(colored_parts)

    # Define the text for each edge
    default_edge_texts = [
        "Highest Entropy Masking", "Pseudo-random Masking", "Random Masking",
        "Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling",
        "Inverse Transform Sampling", "Greedy Sampling", "Temperature Sampling",
        "Exponential Minimum Sampling", "Inverse Transform Sampling", "Greedy Sampling",
        "Temperature Sampling", "Exponential Minimum Sampling", "Inverse Transform Sampling"
    ]

    if len(nodes) < 2:
        print("[ERROR] Insufficient nodes for visualization")
        return go.Figure()

    # Create figure
    fig1 = go.Figure()

    # Add nodes to the figure
    for i, node in enumerate(wrapped_nodes):
        colored_node = color_highlighted_words(node, global_color_map)
        x, y = positions[i]
        fig1.add_trace(go.Scatter(
            x=[-x],  # Reflect the x coordinate
            y=[y],
            mode='markers',
            marker=dict(size=20, color='blue', line=dict(color='black', width=2)),
            hoverinfo='none'
        ))
        fig1.add_annotation(
            x=-x,  # Reflect the x coordinate
            y=y,
            text=colored_node,
            showarrow=False,
            xshift=15,
            align="center",
            font=dict(size=12),
            bordercolor='black',
            borderwidth=2,
            borderpad=4,
            bgcolor='white',
            width=400,
            height=100
        )

    # Add edges and text above each edge
    for i, edge in enumerate(edges):
        x0, y0 = positions[edge[0]]
        x1, y1 = positions[edge[1]]

        # Use strategy if available, otherwise use default edge text
        if strategies and i < len(strategies):
            edge_text = strategies[i]
        else:
            edge_text = default_edge_texts[i % len(default_edge_texts)]

        fig1.add_trace(go.Scatter(
            x=[-x0, -x1],  # Reflect the x coordinates
            y=[y0, y1],
            mode='lines',
            line=dict(color='black', width=1)
        ))

        # Calculate the midpoint of the edge
        mid_x = (-x0 + -x1) / 2
        mid_y = (y0 + y1) / 2

        # Adjust y position to shift text upwards
        text_y_position = mid_y + 0.8  # Increase this value to shift the text further upwards

        # Add text annotation above the edge
        fig1.add_annotation(
            x=mid_x,
            y=text_y_position,
            text=edge_text,  # Use the text specific to this edge
            showarrow=False,
            font=dict(size=12),
            align="center"
        )

    fig1.update_layout(
        showlegend=False,
        margin=dict(t=50, b=50, l=50, r=50),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        width=800 + max_level * 200,  # Adjusted width to accommodate more levels
        height=300 + len(nodes) * 100,  # Adjusted height to accommodate more levels
        plot_bgcolor='rgba(240,240,240,0.2)',
        paper_bgcolor='white'
    )

    return fig1

def generate_subplot2(masked_sentences, sampled_sentences, highlight_info, common_grams):
    """
    Generates a subplot visualizing multiple masked sentences and their sampled variants in a tree structure.
    Each masked sentence will have multiple sampled sentences derived from it using different sampling techniques.

    Args:
        masked_sentences (list of str): A list of masked sentences to be visualized as root nodes.
        sampled_sentences (list of str): A list of sampled sentences derived from masked sentences.
        highlight_info (list of tuples): A list of tuples where each tuple contains a word and its associated color for highlighting.
        common_grams (list of tuples): A list of tuples containing an index and a common word or phrase for LCS numbering.

    Returns:
        plotly.graph_objects.Figure: A Plotly figure representing the tree structure with highlighted words and labeled edges.
    """
    # Define sampling techniques
    sampling_techniques = [
        "Greedy Sampling",
        "Temperature Sampling",
        "Exponential Minimum Sampling",
        "Inverse Transform Sampling"
    ]

    # Calculate total number of nodes
    num_masked = len(masked_sentences)
    num_sampled_per_masked = len(sampling_techniques)
    total_nodes = num_masked + (num_masked * num_sampled_per_masked)

    # Combine all sentences into nodes list with appropriate labels
    nodes = []
    # Level 0: masked sentences (root nodes)
    nodes.extend([s + ' L0' for s in masked_sentences])

    # Level 1: sampled sentences (branch nodes)
    # For each masked sentence, we should have samples from each technique
    sampled_nodes = []

    # Validate if we have the expected number of sampled sentences
    expected_sampled_count = num_masked * num_sampled_per_masked
    if len(sampled_sentences) < expected_sampled_count:
        # If insufficient samples provided, pad with placeholder sentences
        print(f"Warning: Expected {expected_sampled_count} sampled sentences, but got {len(sampled_sentences)}")
        while len(sampled_sentences) < expected_sampled_count:
            sampled_sentences.append(f"Placeholder sampled sentence {len(sampled_sentences) + 1}")

    # Add all sampled sentences with level information
    for s in sampled_sentences[:expected_sampled_count]:
        sampled_nodes.append(s + ' L1')

    nodes.extend(sampled_nodes)

    def apply_lcs_numbering(sentence, common_grams):
        """
        Applies LCS numbering to the sentence based on the common_grams.
        """
        for idx, lcs in common_grams:
            sentence = re.sub(rf"\b{lcs}\b", f"({idx}){lcs}", sentence)
        return sentence

    # Apply LCS numbering
    nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]

    def highlight_words(sentence, color_map):
        """
        Highlights words in the sentence based on the color_map.
        """
        for word, color in color_map.items():
            sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
        return sentence

    # Helper function to color highlighted words
    def color_highlighted_words(node, color_map):
        """
        Colors the highlighted words in the node text.
        """
        parts = re.split(r'(\{\{.*?\}\})', node)
        colored_parts = []
        for part in parts:
            match = re.match(r'\{\{(.*?)\}\}', part)
            if match:
                word = match.group(1)
                color = color_map.get(word, 'black')
                colored_parts.append(f"<span style='color: {color};'>{word}</span>")
            else:
                colored_parts.append(part)
        return ''.join(colored_parts)

    # Clean nodes, highlight words, and wrap text
    cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
    global_color_map = dict(highlight_info)
    highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
    wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=80)) for node in highlighted_nodes]

    # Generate edges based on the tree structure
    def get_levels_and_edges(nodes):
        levels = {}
        edges = []

        # Extract level info from node labels
        for i, node in enumerate(nodes):
            level = int(node.split()[-1][1])
            levels[i] = level

        # Create edges from masked sentences to their sampled variants
        for masked_idx in range(num_masked):
            # For each masked sentence, create edges to its sampled variants
            for technique_idx in range(num_sampled_per_masked):
                sampled_idx = num_masked + (masked_idx * num_sampled_per_masked) + technique_idx
                if sampled_idx < len(nodes):
                    edges.append((masked_idx, sampled_idx))

        return levels, edges

    levels, edges = get_levels_and_edges(nodes)

    # Calculate positions with improved spacing
    positions = {}

    # Calculate horizontal spacing for the root nodes (masked sentences)
    root_x_spacing = 0  # All root nodes at x=0
    root_y_spacing = 8.0  # Vertical spacing between root nodes

    # Calculate positions for sampled nodes
    sampled_x = 3  # X position for all sampled nodes

    # Calculate y positions for root nodes (masked sentences)
    root_y_start = -(num_masked - 1) * root_y_spacing / 2
    for i in range(num_masked):
        positions[i] = (root_x_spacing, root_y_start + i * root_y_spacing)

    # Calculate y positions for sampled nodes
    for masked_idx in range(num_masked):
        root_y = positions[masked_idx][1]  # Y position of parent masked sentence

        # Calculate y-spacing for children of this root
        children_y_spacing = 1.5  # Vertical spacing between children of the same root
        children_y_start = root_y - (num_sampled_per_masked - 1) * children_y_spacing / 2

        # Position each child
        for technique_idx in range(num_sampled_per_masked):
            child_idx = num_masked + (masked_idx * num_sampled_per_masked) + technique_idx
            child_y = children_y_start + technique_idx * children_y_spacing
            positions[child_idx] = (sampled_x, child_y)

    # Create figure
    fig2 = go.Figure()

    # Add nodes
    for i, node in enumerate(wrapped_nodes):
        x, y = positions[i]

        # Define node color based on level
        node_color = 'blue' if levels[i] == 0 else 'green'

        # Add the node marker
        fig2.add_trace(go.Scatter(
            x=[x],
            y=[y],
            mode='markers',
            marker=dict(size=20, color=node_color, line=dict(color='black', width=2)),
            hoverinfo='none'
        ))

        # Add node label with highlighting
        colored_node = color_highlighted_words(node, global_color_map)

        fig2.add_annotation(
            x=x,
            y=y,
            text=colored_node,
            showarrow=False,
            xshift=15,
            align="left",
            font=dict(size=12),
            bordercolor='black',
            borderwidth=2,
            borderpad=4,
            bgcolor='white',
            width=400,
            height=100
        )

    # Add edges with labels
    for i, (src, dst) in enumerate(edges):
        x0, y0 = positions[src]
        x1, y1 = positions[dst]

        # Draw the edge
        fig2.add_trace(go.Scatter(
            x=[x0, x1],
            y=[y0, y1],
            mode='lines',
            line=dict(color='black', width=1)
        ))

        # Add sampling technique label
        # Determine which sampling technique this is
        parent_idx = src
        technique_count = sum(1 for k, (s, _) in enumerate(edges) if s == parent_idx and k < i)
        technique_label = sampling_techniques[technique_count % len(sampling_techniques)]

        # Calculate midpoint for the label
        mid_x = (x0 + x1) / 2
        mid_y = (y0 + y1) / 2

        # Add slight offset to avoid overlap
        label_offset = 0.1

        fig2.add_annotation(
            x=mid_x,
            y=mid_y + label_offset,
            text=technique_label,
            showarrow=False,
            font=dict(size=8),
            align="center"
        )

    # Update layout
    fig2.update_layout(
        showlegend=False,
        margin=dict(t=20, b=20, l=20, r=20),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        width=1200,  # Adjusted width to accommodate more levels
        height=2000,  # Adjusted height to accommodate more levels
        plot_bgcolor='rgba(240,240,240,0.2)',
        paper_bgcolor='white'
    )

    return fig2

if __name__ == "__main__":
    paraphrased_sentence = "The quick brown fox jumps over the lazy dog."
    masked_sentences = [
        "A fast brown fox leaps over the lazy dog.",
        "A quick brown fox hops over a lazy dog."
    ]
    highlight_info = [
        ("quick", "red"),
        ("brown", "green"),
        ("fox", "blue"),
        ("lazy", "purple")
    ]
    common_grams = [
        (1, "quick brown fox"),
        (2, "lazy dog")
    ]

    # Pass None for strategies so the default edge labels are used
    fig1 = generate_subplot1(paraphrased_sentence, masked_sentences, None, highlight_info, common_grams)
    fig1.show()

    sampled_sentence = ["A fast brown fox jumps over a lazy dog."]

    fig2 = generate_subplot2(masked_sentences, sampled_sentence, highlight_info, common_grams)
    fig2.show()
utils/__init__.py
ADDED
@@ -0,0 +1,5 @@
from utils.watermark import Watermarker
from utils.paraphraser import Paraphraser
from utils.entailment import EntailmentAnalyzer
from utils.sampling import SamplingProcessor
from utils.config import load_config
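With these re-exports in place, callers can pull the pipeline components from the package root. A minimal sketch (the config path assumes the file added at utils/config.yaml in this commit):

from utils import EntailmentAnalyzer, load_config

config = load_config("utils/config.yaml")
analyzer = EntailmentAnalyzer(config["PECCAVI_TEXT"]["Entailment"])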
utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (404 Bytes). View file
|
|
utils/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (509 Bytes). View file
|
|
utils/__pycache__/config.cpython-310.pyc
ADDED
Binary file (594 Bytes). View file
|
|
utils/__pycache__/config.cpython-311.pyc
ADDED
Binary file (971 Bytes). View file
|
|
utils/__pycache__/entailment.cpython-310.pyc
ADDED
Binary file (3.69 kB). View file
|
|
utils/__pycache__/entailment.cpython-311.pyc
ADDED
Binary file (5.33 kB). View file
|
|
utils/__pycache__/masking_methods.cpython-310.pyc
ADDED
Binary file (11.1 kB). View file
|
|
utils/__pycache__/masking_methods.cpython-311.pyc
ADDED
Binary file (23.5 kB). View file
|
|
utils/__pycache__/non_melting_point.cpython-310.pyc
ADDED
Binary file (5.05 kB). View file
|
|
utils/__pycache__/non_melting_point.cpython-311.pyc
ADDED
Binary file (9.08 kB). View file
|
|
utils/__pycache__/paraphraser.cpython-310.pyc
ADDED
Binary file (2.85 kB). View file
|
|
utils/__pycache__/paraphraser.cpython-311.pyc
ADDED
Binary file (4.89 kB). View file
|
|
utils/__pycache__/sampling.cpython-310.pyc
ADDED
Binary file (5.06 kB). View file
|
|
utils/__pycache__/sampling.cpython-311.pyc
ADDED
Binary file (9.2 kB). View file
|
|
utils/__pycache__/watermark.cpython-310.pyc
ADDED
Binary file (11.8 kB). View file
|
|
utils/__pycache__/watermark.cpython-311.pyc
ADDED
Binary file (20.1 kB). View file
|
|
utils/config.py
ADDED
@@ -0,0 +1,18 @@
"""
This file loads config from config.yaml
"""

import yaml

def load_config(path):
    """
    Function to load config from config.yaml
    """
    try:
        with open(path, "r") as file:
            config = yaml.safe_load(file)
            return config
    except FileNotFoundError:
        raise FileNotFoundError("Config file not found")
    except Exception as e:
        raise e
utils/config.yaml
ADDED
@@ -0,0 +1,48 @@
# This is the official config file.
PECCAVI_TEXT:
  Entailment:
    task: "text-classification"
    model: "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"

  Masking:
    task: "fill-mask"
    tokenizer: "bert-base-uncased"
    model: "bert-base-uncased"
    # tokenizer: "bert-large-cased-whole-word-masking"
    # model: "bert-large-cased-whole-word-masking"

  Vocabulary:
    tokenizer: "bert-base-uncased"
    model: "bert-base-uncased"
    # permissible_ratio: 0.5
    # tokenizer: "bert-large-cased-whole-word-masking"
    # model: "bert-large-cased-whole-word-masking"
    permissible_ratio: 1.0

  Sampling:
    tokenizer: "bert-base-uncased"
    model: "bert-base-uncased"
    # tokenizer: "bert-large-cased-whole-word-masking"
    # model: "bert-large-cased-whole-word-masking"

  Metrics:
    EuclideanDistance: "sentence-transformers/all-MiniLM-L6-v2"
    Distortion: "gpt2"

  Detector:
    tokenizer: "bert-base-uncased"
    model: "bert-base-uncased"
    # tokenizer: "bert-large-cased-whole-word-masking"
    # model: "bert-large-cased-whole-word-masking"

  Paraphrase:
    tokenizer: "humarin/chatgpt_paraphraser_on_T5_base"
    model: "humarin/chatgpt_paraphraser_on_T5_base"
    num_beams: 10
    num_beam_groups: 10
    num_return_sequences: 10
    repetition_penalty: 10.0
    diversity_penalty: 3.0
    no_repeat_ngram_size: 2
    temperature: 0.7
    max_length: 64
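load_config returns this file as a plain nested dict, so each component reads its own sub-section. A minimal sketch of pulling out the Paraphrase settings (the path assumes the file sits at utils/config.yaml, as added in this commit):

from utils.config import load_config

config = load_config("utils/config.yaml")
paraphrase_cfg = config["PECCAVI_TEXT"]["Paraphrase"]
print(paraphrase_cfg["model"], paraphrase_cfg["num_return_sequences"])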
utils/entailment.py
ADDED
@@ -0,0 +1,107 @@
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import numpy as np
from transformers import pipeline
from typing import List
from utils.config import load_config


class EntailmentAnalyzer:
    # def __init__(self, config_path: str):
    def __init__(self, config):
        """
        Initialize the EntailmentAnalyzer with the config file path.

        Args:
            config_path: The path to the configuration file.
        """
        # self.config = load_config(config_path)['PECCAVI_TEXT']['Entailment']
        self.config = config
        self.entailment_pipeline = pipeline(task=self.config['task'], model=self.config['model'])

    def check_entailment(self, premise: str, hypothesis: str) -> float:
        """
        Check entailment between the premise and hypothesis.

        Args:
            premise: The premise sentence.
            hypothesis: The hypothesis sentence.

        Returns:
            float: The entailment score.
        """
        results = self.entailment_pipeline(f"{premise} [SEP] {hypothesis}", top_k=None)
        entailment_score = next(item['score'] for item in results if item['label'] == 'entailment')
        return entailment_score

    def analyze_entailment(self, original_sentence: str, paraphrased_sentences: List[str], threshold: float) -> tuple:
        """
        Analyze entailment scores for paraphrased sentences. If no selected sentences are found,
        lower the threshold and rerun the analysis.

        Args:
            original_sentence: The original sentence.
            paraphrased_sentences: List of paraphrased sentences.
            threshold: Minimum score to select a sentence.

        Returns:
            tuple: A dictionary of all scores, selected sentences, and discarded sentences.
        """
        all_sentences = {}
        selected_sentences = {}
        discarded_sentences = {}

        # Loop to reduce threshold if no sentences are selected
        while not selected_sentences:
            for paraphrased_sentence in paraphrased_sentences:
                entailment_score = self.check_entailment(original_sentence, paraphrased_sentence)

                all_sentences[paraphrased_sentence] = entailment_score
                if entailment_score >= threshold:
                    selected_sentences[paraphrased_sentence] = entailment_score
                else:
                    discarded_sentences[paraphrased_sentence] = entailment_score

            # If no sentences are selected, lower the threshold
            if not selected_sentences:
                print(f"No selected sentences found. Lowering the threshold by 0.1 (from {threshold} to {threshold - 0.1}).")
                threshold -= 0.1
                if threshold <= 0:
                    print("Threshold has reached 0. No sentences meet the criteria.")
                    break

        return all_sentences, selected_sentences, discarded_sentences


if __name__ == "__main__":
    config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.yaml')

    config_path = '/home/ashhar21137/text_wm/scratch/utils/config/config.yaml'

    config = load_config(config_path)

    entailment_analyzer = EntailmentAnalyzer(config['PECCAVI_TEXT']['Entailment'])

    all_sentences, selected_sentences, discarded_sentences = entailment_analyzer.analyze_entailment(
        "The weather is nice today",
        [
            "The climate is pleasant today",
            "It's a good day weather-wise",
            "Today, the weather is terrible",
            "What a beautiful day it is",
            "The sky is clear and the weather is perfect",
            "It's pouring rain outside today",
            "The weather isn't bad today",
            "A lovely day for outdoor activities"
        ],
        0.7
    )

    print("----------------------- All Sentences -----------------------")
    print(all_sentences)
    print("----------------------- Discarded Sentences -----------------------")
    print(discarded_sentences)
    print("----------------------- Selected Sentences -----------------------")
    print(selected_sentences)
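Because the constructor takes only the Entailment sub-section (a dict with 'task' and 'model'), the analyzer can also be built without any YAML on disk. A minimal sketch reusing the model named in config.yaml:

from utils.entailment import EntailmentAnalyzer

analyzer = EntailmentAnalyzer({
    "task": "text-classification",
    "model": "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli",
})
score = analyzer.check_entailment("The weather is nice today", "The climate is pleasant today")
print(score)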
utils/masking_methods.py
ADDED
@@ -0,0 +1,304 @@
1 |
+
import random
|
2 |
+
import torch
|
3 |
+
import logging
|
4 |
+
from transformers import BertTokenizer, BertForMaskedLM
|
5 |
+
from nltk.corpus import stopwords
|
6 |
+
import nltk
|
7 |
+
from transformers import RobertaTokenizer, RobertaForMaskedLM
|
8 |
+
from tqdm import tqdm
|
9 |
+
|
10 |
+
# Set logging to WARNING for a cleaner terminal.
|
11 |
+
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
|
12 |
+
logger = logging.getLogger(__name__)
|
13 |
+
|
14 |
+
# Ensure stopwords are downloaded
|
15 |
+
try:
|
16 |
+
nltk.data.find('corpora/stopwords')
|
17 |
+
except LookupError:
|
18 |
+
nltk.download('stopwords')
|
19 |
+
|
20 |
+
class MaskingProcessor:
|
21 |
+
def __init__(self, tokenizer, model):
|
22 |
+
self.tokenizer = tokenizer
|
23 |
+
self.model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
|
24 |
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
25 |
+
self.stop_words = set(stopwords.words('english'))
|
26 |
+
tqdm.write(f"[MaskingProcessor] Initialized on device: {self.device}")
|
27 |
+
|
28 |
+
def remove_stopwords(self, words):
|
29 |
+
return [word for word in words if word.lower() not in self.stop_words]
|
30 |
+
|
31 |
+
def adjust_ngram_indices(self, original_words, common_ngrams):
|
32 |
+
logger.info("Adjusting n-gram indices.")
|
33 |
+
non_stop_words = self.remove_stopwords(original_words)
|
34 |
+
original_to_non_stop = []
|
35 |
+
non_stop_idx = 0
|
36 |
+
for original_idx, word in enumerate(original_words):
|
37 |
+
if word.lower() not in self.stop_words:
|
38 |
+
original_to_non_stop.append((original_idx, non_stop_idx))
|
39 |
+
non_stop_idx += 1
|
40 |
+
adjusted_ngrams = {}
|
41 |
+
for ngram, positions in common_ngrams.items():
|
42 |
+
adjusted_positions = []
|
43 |
+
for start, end in positions:
|
44 |
+
try:
|
45 |
+
new_start = next(non_stop for orig, non_stop in original_to_non_stop if orig == start)
|
46 |
+
new_end = next(non_stop for orig, non_stop in original_to_non_stop if orig == end)
|
47 |
+
adjusted_positions.append((new_start, new_end))
|
48 |
+
except StopIteration:
|
49 |
+
continue
|
50 |
+
adjusted_ngrams[ngram] = adjusted_positions
|
51 |
+
return adjusted_ngrams
|
52 |
+
|
53 |
+
def mask_sentence_random(self, sentence, common_ngrams):
|
54 |
+
tqdm.write(f"[MaskingProcessor] Masking (random) sentence: {sentence}")
|
55 |
+
original_words = sentence.split()
|
56 |
+
has_punctuation = False
|
57 |
+
punctuation = ''
|
58 |
+
if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
|
59 |
+
has_punctuation = True
|
60 |
+
punctuation = original_words[-1][-1]
|
61 |
+
original_words = original_words[:-1]
|
62 |
+
|
63 |
+
non_stop_words = self.remove_stopwords(original_words)
|
64 |
+
adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
|
65 |
+
mask_indices = []
|
66 |
+
|
67 |
+
ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
|
68 |
+
if ngram_positions:
|
69 |
+
first_ngram_start = ngram_positions[0][0]
|
70 |
+
if first_ngram_start > 0:
|
71 |
+
mask_index_before_ngram = random.randint(0, first_ngram_start-1)
|
72 |
+
mask_indices.append(mask_index_before_ngram)
|
73 |
+
|
74 |
+
for i in range(len(ngram_positions) - 1):
|
75 |
+
end_prev = ngram_positions[i][1]
|
76 |
+
start_next = ngram_positions[i + 1][0]
|
77 |
+
if start_next > end_prev + 1:
|
78 |
+
mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
|
79 |
+
mask_indices.append(mask_index_between_ngrams)
|
80 |
+
|
81 |
+
last_ngram_end = ngram_positions[-1][1]
|
82 |
+
if last_ngram_end < len(non_stop_words) - 1:
|
83 |
+
mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
|
84 |
+
mask_indices.append(mask_index_after_ngram)
|
85 |
+
|
86 |
+
non_stop_to_original = {}
|
87 |
+
non_stop_idx = 0
|
88 |
+
for orig_idx, word in enumerate(original_words):
|
89 |
+
if word.lower() not in self.stop_words:
|
90 |
+
non_stop_to_original[non_stop_idx] = orig_idx
|
91 |
+
non_stop_idx += 1
|
92 |
+
|
93 |
+
original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
|
94 |
+
masked_words = original_words.copy()
|
95 |
+
for idx in original_mask_indices:
|
96 |
+
masked_words[idx] = self.tokenizer.mask_token
|
97 |
+
|
98 |
+
if has_punctuation:
|
99 |
+
masked_words.append(punctuation)
|
100 |
+
|
101 |
+
logger.info(f"Masked sentence (random): {' '.join(masked_words)}")
|
102 |
+
return " ".join(masked_words), original_mask_indices
|
103 |
+
|
104 |
+
def mask_sentence_pseudorandom(self, sentence, common_ngrams):
|
105 |
+
logger.info(f"Masking sentence using pseudorandom strategy: {sentence}")
|
106 |
+
random.seed(3)
|
107 |
+
original_words = sentence.split()
|
108 |
+
has_punctuation = False
|
109 |
+
punctuation = ''
|
110 |
+
if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
|
111 |
+
has_punctuation = True
|
112 |
+
punctuation = original_words[-1][-1]
|
113 |
+
original_words = original_words[:-1]
|
114 |
+
|
115 |
+
non_stop_words = self.remove_stopwords(original_words)
|
116 |
+
adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
|
117 |
+
mask_indices = []
|
118 |
+
ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
|
119 |
+
|
120 |
+
if ngram_positions:
|
121 |
+
first_ngram_start = ngram_positions[0][0]
|
122 |
+
if first_ngram_start > 0:
|
123 |
+
mask_index_before_ngram = random.randint(0, first_ngram_start-1)
|
124 |
+
mask_indices.append(mask_index_before_ngram)
|
125 |
+
for i in range(len(ngram_positions) - 1):
|
126 |
+
end_prev = ngram_positions[i][1]
|
127 |
+
start_next = ngram_positions[i + 1][0]
|
128 |
+
if start_next > end_prev + 1:
|
129 |
+
mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
|
130 |
+
mask_indices.append(mask_index_between_ngrams)
|
131 |
+
last_ngram_end = ngram_positions[-1][1]
|
132 |
+
if last_ngram_end < len(non_stop_words) - 1:
|
133 |
+
mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
|
134 |
+
mask_indices.append(mask_index_after_ngram)
|
135 |
+
|
136 |
+
non_stop_to_original = {}
|
137 |
+
non_stop_idx = 0
|
138 |
+
for orig_idx, word in enumerate(original_words):
|
139 |
+
if word.lower() not in self.stop_words:
|
140 |
+
non_stop_to_original[non_stop_idx] = orig_idx
|
141 |
+
non_stop_idx += 1
|
142 |
+
|
143 |
+
original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
|
144 |
+
masked_words = original_words.copy()
|
145 |
+
for idx in original_mask_indices:
|
146 |
+
masked_words[idx] = self.tokenizer.mask_token
|
147 |
+
|
148 |
+
if has_punctuation:
|
149 |
+
masked_words.append(punctuation)
|
150 |
+
|
151 |
+
logger.info(f"Masked sentence (pseudorandom): {' '.join(masked_words)}")
|
152 |
+
return " ".join(masked_words), original_mask_indices
|
153 |
+
|
154 |
+
def mask_sentence_entropy(self, sentence, common_ngrams):
|
155 |
+
logger.info(f"Masking sentence using entropy strategy: {sentence}")
|
156 |
+
original_words = sentence.split()
|
157 |
+
has_punctuation = False
|
158 |
+
punctuation = ''
|
159 |
+
if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
|
160 |
+
has_punctuation = True
|
161 |
+
punctuation = original_words[-1][-1]
|
162 |
+
original_words = original_words[:-1]
|
163 |
+
|
164 |
+
non_stop_words = self.remove_stopwords(original_words)
|
165 |
+
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
        mask_indices = []
        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]

        # Map positions in the stopword-free word list back to indices in the original sentence.
        non_stop_to_original = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                non_stop_idx += 1

        if ngram_positions:
            # Before the first common n-gram: mask the highest-entropy candidate.
            first_ngram_start = ngram_positions[0][0]
            if first_ngram_start > 0:
                candidate_positions = range(0, first_ngram_start)
                entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) for pos in candidate_positions]
                mask_indices.append(max(entropies, key=lambda x: x[1])[0])
            # Between consecutive common n-grams.
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                start_next = ngram_positions[i + 1][0]
                if start_next > end_prev + 1:
                    candidate_positions = range(end_prev + 1, start_next)
                    entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) for pos in candidate_positions]
                    mask_indices.append(max(entropies, key=lambda x: x[1])[0])
            # After the last common n-gram.
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                candidate_positions = range(last_ngram_end + 1, len(non_stop_words))
                entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos])) for pos in candidate_positions]
                mask_indices.append(max(entropies, key=lambda x: x[1])[0])

        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token

        if has_punctuation:
            masked_words.append(punctuation)

        logger.info(f"Masked sentence (entropy): {' '.join(masked_words)}")
        return " ".join(masked_words), original_mask_indices

    def calculate_mask_logits(self, original_sentence, original_mask_indices):
        logger.info(f"Calculating mask logits for sentence: {original_sentence}")
        words = original_sentence.split()
        mask_logits = {}
        for idx in original_mask_indices:
            masked_words = words.copy()
            masked_words[idx] = self.tokenizer.mask_token
            masked_sentence = " ".join(masked_words)
            input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"].to(self.device)
            mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
            with torch.no_grad():
                outputs = self.model(input_ids)
                logits = outputs.logits
            mask_logits_tensor = logits[0, mask_token_index, :]
            top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 100, dim=-1)
            # Keep the 50 highest-scoring whole-word candidates, skipping subword pieces and duplicates.
            top_tokens = []
            top_logits = []
            seen_words = set()
            for token_id, logit in zip(top_mask_indices[0], top_mask_logits[0]):
                token = self.tokenizer.convert_ids_to_tokens(token_id.item())
                if token.startswith('##'):
                    continue
                word = self.tokenizer.convert_tokens_to_string([token]).strip()
                if word and word not in seen_words:
                    seen_words.add(word)
                    top_tokens.append(word)
                    top_logits.append(logit.item())
                if len(top_tokens) == 50:
                    break
            mask_logits[idx] = {
                "tokens": top_tokens,
                "logits": top_logits
            }
        logger.info("Completed calculating mask logits.")
        return mask_logits

    def calculate_word_entropy(self, sentence, word_position):
        logger.info(f"Calculating word entropy for position {word_position} in sentence: {sentence}")
        words = sentence.split()
        masked_words = words.copy()
        masked_words[word_position] = self.tokenizer.mask_token
        masked_sentence = " ".join(masked_words)
        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"].to(self.device)
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits
        probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1)
        entropy = -torch.sum(probs * torch.log(probs + 1e-9))
        logger.info(f"Computed entropy: {entropy.item()}")
        return entropy.item()

    def process_sentences(self, sentences_list, common_grams, method="random"):
        tqdm.write(f"[MaskingProcessor] Processing sentences using method: {method}")
        results = {}
        for sentence, ngrams in tqdm(common_grams.items(), desc="Masking Sentences"):
            # Detach trailing punctuation so it is never chosen as a masking candidate.
            words = sentence.split()
            last_word = words[-1]
            if any(last_word.endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
                words[-1] = last_word[:-1]
                punctuation = last_word[-1]
                processed_sentence = " ".join(words) + " " + punctuation
            else:
                processed_sentence = sentence

            if method == "random":
                masked_sentence, original_mask_indices = self.mask_sentence_random(processed_sentence, ngrams)
            elif method == "pseudorandom":
                masked_sentence, original_mask_indices = self.mask_sentence_pseudorandom(processed_sentence, ngrams)
            else:  # entropy
                masked_sentence, original_mask_indices = self.mask_sentence_entropy(processed_sentence, ngrams)

            logits = self.calculate_mask_logits(processed_sentence, original_mask_indices)
            results[sentence] = {
                "masked_sentence": masked_sentence,
                "mask_logits": logits
            }
            logger.info(f"Processed sentence: {sentence}")
        tqdm.write("[MaskingProcessor] Completed processing sentences.")
        return results

if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over small cat the lazy dog everyday again and again .",
    ]
    result_dict = {
        'The quick brown fox jumps over small cat the lazy dog everyday again and again .': {
            'brown fox': [(2, 3)],
            'cat': [(7, 7)],
            'dog': [(10, 10)]
        }
    }
    processor = MaskingProcessor(
        BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking"),
        BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
    )
    # Pass the variables defined above (the original referenced undefined names sentences_list / common_grams).
    results_entropy = processor.process_sentences(sentences, result_dict, method="random")
    for sentence, output in results_entropy.items():
        logger.info(f"Original Sentence (Random): {sentence}")
        logger.info(f"Masked Sentence (Random): {output['masked_sentence']}")
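
Note on how the pieces fit together: `process_sentences` expects its second argument in exactly the shape produced by `NgramProcessor.find_filtered_ngrams` below, i.e. a dict mapping each sentence to {n-gram: [(start_word_idx, end_word_idx), ...]}. A minimal wiring sketch, not part of the diff; the import paths and module names are assumptions based on the file layout in this commit:

    from utils.non_melting_point import NgramProcessor
    from utils.masking_methods import MaskingProcessor   # assumed module path for the masking class above
    from transformers import BertTokenizer, BertForMaskedLM

    sentences = ["The quick brown fox jumps over the lazy dog ."]
    common_grams = NgramProcessor().find_filtered_ngrams(sentences)   # {sentence: {ngram: [(start, end)]}}
    masker = MaskingProcessor(
        BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking"),
        BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking"),
    )
    results = masker.process_sentences(sentences, common_grams, method="entropy")
    print(results[sentences[0]]["masked_sentence"])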
utils/non_melting_point.py
ADDED
@@ -0,0 +1,137 @@
import nltk
import logging
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import re
from tqdm import tqdm

# Set logging to WARNING for minimal console output.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

class NgramProcessor:
    def __init__(self):
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        tqdm.write("[NgramProcessor] Initialized with stopwords.")

    def remove_stopwords(self, text):
        # No need for extensive logging inside this helper.
        words = re.findall(r'\w+', text.lower())
        filtered_words = [word for word in words if word not in self.stop_words]
        return ' '.join(filtered_words)

    def is_exact_match(self, ngram, sentences):
        logger.info(f"Checking exact match for ngram: {ngram}")
        result = all(ngram in sentence for sentence in sentences)
        logger.info(f"Exact match result for '{ngram}': {result}")
        return result

    def is_substring_of_any(self, ngram, common_ngrams):
        logger.info(f"Checking if ngram: {ngram} is substring of any common ngram.")
        result = any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)
        logger.info(f"Substring check result for '{ngram}': {result}")
        return result

    def find_filtered_ngrams(self, sentences):
        from collections import Counter
        tqdm.write("[NgramProcessor] Cleaning sentences...")
        sentences_cleaned = [self.remove_stopwords(sentence)
                             for sentence in tqdm(sentences, desc="Cleaning Sentences")]
        ngram_lengths = [4, 3, 2, 1]
        common_ngrams = []
        result = {}
        # Collect n-grams (longest first) that appear in every cleaned sentence and are not
        # substrings of an already accepted longer n-gram.
        for n in ngram_lengths:
            ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences_cleaned]
            ngrams_counter = Counter(ngrams_list[0])
            for ngram in ngrams_counter:
                ngram_str = ' '.join(ngram)
                if any(word in self.stop_words for word in ngram_str.split()):
                    continue
                if self.is_exact_match(ngram_str, sentences_cleaned) and not self.is_substring_of_any(ngram_str, common_ngrams):
                    common_ngrams.append(ngram_str)
        # Map each common n-gram back to word indices in the original (uncleaned) sentences.
        for sentence, cleaned_sentence in tqdm(zip(sentences, sentences_cleaned),
                                               total=len(sentences),
                                               desc="Mapping N-grams"):
            sentence_result = {}
            original_words = sentence.split()
            cleaned_words = cleaned_sentence.split()
            index_map = {}
            cleaned_idx = 0
            for orig_idx, word in enumerate(original_words):
                if word.lower() not in self.stop_words:
                    index_map[cleaned_idx] = orig_idx
                    cleaned_idx += 1
            for ngram in common_ngrams:
                ngram_words = ngram.split()
                indices = []
                for i in range(len(cleaned_words) - len(ngram_words) + 1):
                    if cleaned_words[i:i + len(ngram_words)] == ngram_words:
                        if i in index_map:
                            start_idx = index_map[i]
                            end_idx = index_map.get(i + len(ngram_words) - 1, start_idx)
                            if end_idx - start_idx == len(ngram_words) - 1:
                                indices.append((start_idx, end_idx))
                if indices:
                    sentence_result[ngram] = indices
            result[sentence] = sentence_result
        return result

    # def find_relative_order(self, sentence, common_ngrams):
    #     from tqdm import tqdm
    #     relative_order = []
    #     for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False):
    #         index = sentence.find(ngram)
    #         if index != -1:
    #             relative_order.append((index, ngram))
    #     return sorted(relative_order)

    def find_relative_order(self, sentence, common_ngrams):
        from tqdm import tqdm
        sentence = sentence.lower()
        relative_order = []

        for ngram in tqdm(common_ngrams, desc="Ordering N-grams", leave=False):
            index = sentence.find(ngram.lower())
            if index != -1:
                relative_order.append((index, ngram))

        sorted_pairs = sorted(relative_order)
        return [(i + 1, ngram) for i, (_, ngram) in enumerate(sorted_pairs)]

# Example usage
if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog .",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog.",
    ]
    processor = NgramProcessor()
    common_ngrams = processor.find_filtered_ngrams(sentences)
    print(common_ngrams)
    # modified_output = list({
    #     (indices[0][0], gram)
    #     for grams in common_ngrams.values()
    #     for gram, indices in grams.items()
    # })
    # print(modified_output)
    logger.info(f"Common n-grams and their indices per sentence: {common_ngrams}")
    for sentence in sentences:
        order = processor.find_relative_order(sentence, common_ngrams[sentence])
        logger.info(f"Sentence: {sentence} -> Order: {order}")


"""
{
 'The quick brown fox jumps over the lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]},
 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]},
 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(1, 2)], 'dog': [(5, 5)]}
}
"""
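
The trickiest step above is the cleaned-to-original index mapping: after stop words are removed, position i in the cleaned word list is mapped back to its original word index via `index_map`, so the (start, end) pairs in the result always refer to the original sentence. A standalone illustration (not part of the diff; the toy stop-word set is an assumption for the example):

    stop_words = {"the", "over"}   # toy set for illustration only
    sentence = "The quick brown fox jumps over the lazy dog ."
    original_words = sentence.split()
    index_map = {}
    cleaned_idx = 0
    for orig_idx, word in enumerate(original_words):
        if word.lower() not in stop_words:
            index_map[cleaned_idx] = orig_idx
            cleaned_idx += 1
    # 'brown fox' sits at cleaned positions (1, 2) -> original positions (2, 3)
    print(index_map[1], index_map[2])  # 2 3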
utils/old/masking/masking_methods.py
ADDED
@@ -0,0 +1,355 @@
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM
from nltk.corpus import stopwords
import nltk

# Ensure stopwords are downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class MaskingProcessor:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.stop_words = set(stopwords.words('english'))

    def adjust_ngram_indices(self, words, common_ngrams, remove_stopwords):
        """
        Adjust indices of common n-grams after removing stop words.

        Args:
            words (list): List of words in the original sentence.
            common_ngrams (dict): Common n-grams and their indices.

        Returns:
            dict: Adjusted common n-grams and their indices.
        """
        if not remove_stopwords:
            return common_ngrams

        non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
        adjusted_ngrams = {}

        for ngram, positions in common_ngrams.items():
            adjusted_positions = []
            for start, end in positions:
                try:
                    new_start = non_stop_word_indices.index(start)
                    new_end = non_stop_word_indices.index(end)
                    adjusted_positions.append((new_start, new_end))
                except ValueError:
                    continue  # Skip if indices cannot be mapped
            adjusted_ngrams[ngram] = adjusted_positions

        return adjusted_ngrams

    # def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords):
    #     """
    #     Mask one word before the first common n-gram, one between two n-grams,
    #     and one after the last common n-gram (random selection).
    #
    #     Args:
    #         original_sentence (str): Original sentence
    #         common_ngrams (dict): Common n-grams and their indices
    #
    #     Returns:
    #         str: Masked sentence with original stop words retained
    #     """
    #     words = original_sentence.split()
    #     if remove_stopwords:
    #         non_stop_words = [word for word in words if word.lower() not in self.stop_words]
    #         non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
    #     else:
    #         non_stop_words = words
    #         non_stop_word_indices = list(range(len(words)))
    #     # non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
    #     adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)
    #
    #     mask_indices = []
    #     # Handle before the first common n-gram
    #     if adjusted_ngrams:
    #         first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
    #         if first_ngram_start > 0:
    #             mask_indices.append(random.randint(0, first_ngram_start - 1))
    #
    #         # Handle between common n-grams
    #         ngram_positions = list(adjusted_ngrams.values())
    #         for i in range(len(ngram_positions) - 1):
    #             end_prev = ngram_positions[i][-1][1]
    #             start_next = ngram_positions[i + 1][0][0]
    #             if start_next > end_prev + 1:
    #                 mask_indices.append(random.randint(end_prev + 1, start_next - 1))
    #
    #         # Handle after the last common n-gram
    #         last_ngram_end = ngram_positions[-1][-1][1]
    #         if last_ngram_end < len(non_stop_words) - 1:
    #             mask_indices.append(random.randint(last_ngram_end + 1, len(non_stop_words) - 1))
    #
    #     # Mask the chosen indices
    #     original_masked_sentence = words[:]
    #     # for idx in mask_indices:
    #     #     if idx not in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
    #     #         non_stop_words[idx] = self.tokenizer.mask_token
    #     #         original_masked_sentence[idx] = self.tokenizer.mask_token
    #     for idx in mask_indices:
    #         if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
    #             continue  # Skip if index belongs to common n-grams
    #         if remove_stopwords:
    #             original_idx = non_stop_word_indices[idx]  # Map back to original indices
    #             original_masked_sentence[original_idx] = self.tokenizer.mask_token
    #         else:
    #             original_masked_sentence[idx] = self.tokenizer.mask_token
    #
    #     return " ".join(original_masked_sentence)

    def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (random selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices
            remove_stopwords (bool): Whether to remove stop words

        Returns:
            str: Masked sentence with original stop words retained
        """
        words = original_sentence.split()
        if remove_stopwords:
            non_stop_words = [word for word in words if word.lower() not in self.stop_words]
            non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
        else:
            non_stop_words = words
            non_stop_word_indices = list(range(len(words)))

        adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)

        # Collect all indices corresponding to common n-grams
        common_ngram_indices = {
            idx for ngram_positions in adjusted_ngrams.values()
            for start, end in ngram_positions
            for idx in range(start, end + 1)
        }

        mask_indices = []
        # Handle before the first common n-gram
        if adjusted_ngrams:
            first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
            if first_ngram_start > 0:
                potential_indices = [i for i in range(first_ngram_start) if i not in common_ngram_indices]
                if potential_indices:
                    mask_indices.append(random.choice(potential_indices))

            # Handle between common n-grams
            ngram_positions = list(adjusted_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                potential_indices = [i for i in range(end_prev + 1, start_next) if i not in common_ngram_indices]
                if potential_indices:
                    mask_indices.append(random.choice(potential_indices))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                potential_indices = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i not in common_ngram_indices]
                if potential_indices:
                    mask_indices.append(random.choice(potential_indices))

        # Mask the chosen indices
        original_masked_sentence = words[:]
        for idx in mask_indices:
            if remove_stopwords:
                original_idx = non_stop_word_indices[idx]  # Map back to original indices
                original_masked_sentence[original_idx] = self.tokenizer.mask_token
            else:
                original_masked_sentence[idx] = self.tokenizer.mask_token

        return " ".join(original_masked_sentence)

    def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (highest entropy selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence with original stop words retained
        """
        words = original_sentence.split()
        # non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
        if remove_stopwords:
            non_stop_words = [word for word in words if word.lower() not in self.stop_words]
            non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
        else:
            non_stop_words = words
            non_stop_word_indices = list(range(len(words)))
        adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)
        entropy_scores = {}

        for idx, word in enumerate(non_stop_words):
            if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                continue  # Skip words in common n-grams

            masked_sentence = non_stop_words[:idx] + [self.tokenizer.mask_token] + non_stop_words[idx + 1:]
            masked_sentence = " ".join(masked_sentence)
            input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
            mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

            with torch.no_grad():
                outputs = self.model(input_ids)
                logits = outputs.logits

            filtered_logits = logits[0, mask_token_index, :]
            probs = torch.softmax(filtered_logits, dim=-1)
            entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item()  # Add epsilon to prevent log(0)
            entropy_scores[idx] = entropy

        mask_indices = []

        # Handle before the first common n-gram
        if adjusted_ngrams:
            first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
            candidates = [i for i in range(first_ngram_start) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle between common n-grams
            ngram_positions = list(adjusted_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores]
                if candidates:
                    mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            candidates = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

        # Mask the chosen indices
        original_masked_sentence = words[:]
        # for idx in mask_indices:
        #     non_stop_words[idx] = self.tokenizer.mask_token
        #     original_masked_sentence[idx] = self.tokenizer.mask_token

        for idx in mask_indices:
            if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                continue  # Skip if index belongs to common n-grams
            if remove_stopwords:
                original_idx = non_stop_word_indices[idx]  # Map back to original indices
                original_masked_sentence[original_idx] = self.tokenizer.mask_token
            else:
                original_masked_sentence[idx] = self.tokenizer.mask_token

        return " ".join(original_masked_sentence)

    def calculate_mask_logits(self, masked_sentence):
        """
        Calculate logits for masked tokens in the sentence using BERT.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens

        Returns:
            dict: Masked token indices and their logits
        """
        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
        return mask_logits

    def process_sentences(self, original_sentences, result_dict, method="random", remove_stopwords=False):
        """
        Process a list of sentences and calculate logits for masked tokens using the specified method.

        Args:
            original_sentences (list): List of original sentences
            result_dict (dict): Common n-grams and their indices for each sentence
            method (str): Masking method ("random" or "entropy")

        Returns:
            dict: Masked sentences and their logits for each sentence
        """
        results = {}

        for sentence, ngrams in result_dict.items():
            if method == "random":
                masked_sentence = self.mask_sentence_random(sentence, ngrams, remove_stopwords)
            elif method == "entropy":
                masked_sentence = self.mask_sentence_entropy(sentence, ngrams, remove_stopwords)
            else:
                raise ValueError("Invalid method. Choose 'random' or 'entropy'.")

            logits = self.calculate_mask_logits(masked_sentence)
            results[sentence] = {
                "masked_sentence": masked_sentence,
                "mask_logits": logits
            }

        return results

# Example usage
if __name__ == "__main__":
    # !!! Working both the cases regardless if the stopword is removed or not
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog."
    ]
    result_dict = {
        'The quick brown fox jumps over the lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
    }

    processor = MaskingProcessor()
    results_random = processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=True)
    # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)

    for sentence, output in results_random.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {output['masked_sentence']}")
        # # print(f"Mask Logits (Random): {output['mask_logits']}")
        # print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
        # print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
        # print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
        print('--------------------------------')
        # for mask_idx, logits in output["mask_logits"].items():
        #     print(f"Logits for [MASK] at position {mask_idx}:")
        #     print(f' logits : {logits[:5]}')  # List of logits for all vocabulary tokens

    # result_dict = {
    #     "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    #     "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    #     "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
    # }

    # print('--------------------------------')
    # for sentence, output in results_entropy.items():
    #     print(f"Original Sentence (Entropy): {sentence}")
    #     print(f"Masked Sentence (Entropy): {output['masked_sentence']}")
    #     # print(f"Mask Logits (Entropy): {output['mask_logits']}")
    #     print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
    #     print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
    #     print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
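
The entropy score used by `mask_sentence_entropy` is the Shannon entropy of the MLM's predicted distribution at the [MASK] position, H = -sum(p * log p); a higher value means the model is less certain about the gap, so that position carries more information to replace. A minimal, self-contained sketch (illustrative values only, not repository code):

    import torch

    logits = torch.tensor([2.0, 1.0, 0.5, 0.1])              # stand-in for MLM logits at the [MASK] position
    probs = torch.softmax(logits, dim=-1)
    entropy = -torch.sum(probs * torch.log(probs + 1e-10))   # same epsilon trick as in the code above
    print(entropy.item())                                    # higher entropy -> less certain -> better masking candidate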
utils/old/masking/masking_methods_new_work.py
ADDED
@@ -0,0 +1,447 @@
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM
from nltk.corpus import stopwords
import nltk

# Ensure stopwords are downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class MaskingProcessor:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.stop_words = set(stopwords.words('english'))

    def remove_stopwords(self, words):
        """
        Remove stopwords from the given list of words.

        Args:
            words (list): List of words.

        Returns:
            list: List of non-stop words.
        """
        return [word for word in words if word.lower() not in self.stop_words]

    def adjust_ngram_indices(self, original_words, common_ngrams):
        """
        Adjust indices of common n-grams after removing stopwords.

        Args:
            original_words (list): Original list of words.
            common_ngrams (dict): Common n-grams and their indices.

        Returns:
            dict: Adjusted common n-grams with updated indices.
        """
        non_stop_words = self.remove_stopwords(original_words)
        original_to_non_stop = []
        non_stop_idx = 0

        for original_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                original_to_non_stop.append((original_idx, non_stop_idx))
                non_stop_idx += 1

        adjusted_ngrams = {}
        for ngram, positions in common_ngrams.items():
            adjusted_positions = []
            for start, end in positions:
                try:
                    new_start = next(non_stop for orig, non_stop in original_to_non_stop if orig == start)
                    new_end = next(non_stop for orig, non_stop in original_to_non_stop if orig == end)
                    adjusted_positions.append((new_start, new_end))
                except StopIteration:
                    continue  # Skip if indices cannot be mapped
            adjusted_ngrams[ngram] = adjusted_positions

        return adjusted_ngrams

    def mask_sentence_random(self, sentence, common_ngrams):
        """
        Mask words in the sentence based on the specified rules after removing stopwords.
        """
        original_words = sentence.split()
        print(f' ---- original_words : {original_words} ----- ')
        non_stop_words = self.remove_stopwords(original_words)
        print(f' ---- non_stop_words : {non_stop_words} ----- ')
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
        print(f' ---- common_ngrams : {common_ngrams} ----- ')
        print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')

        mask_indices = []

        # Extract n-gram positions in non-stop words
        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]

        # Mask a word before the first common n-gram
        if ngram_positions:
            print(f' ---- ngram_positions : {ngram_positions} ----- ')
            first_ngram_start = ngram_positions[0][0]
            print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
            if first_ngram_start > 0:
                mask_index_before_ngram = random.randint(0, first_ngram_start - 1)
                print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
                mask_indices.append(mask_index_before_ngram)

            # Mask words between common n-grams
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                print(f' ---- end_prev : {end_prev} ----- ')  # end index of the previous n-gram
                start_next = ngram_positions[i + 1][0]
                print(f' ---- start_next : {start_next} ----- ')
                if start_next > end_prev + 1:
                    mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
                    print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
                    mask_indices.append(mask_index_between_ngrams)

            # Mask a word after the last common n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
                mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
                print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
                mask_indices.append(mask_index_after_ngram)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                non_stop_idx += 1

        # Map mask indices from non-stop word positions to original positions
        print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ')
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        print(f' ---- original_mask_indices : {original_mask_indices} ----- ')

        # Apply masks to the original sentence
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token

        return " ".join(masked_words)

    def mask_sentence_pseudorandom(self, sentence, common_ngrams):
        """
        Mask words in the sentence based on the specified rules after removing stopwords.
        """
        random.seed(42)
        original_words = sentence.split()
        print(f' ---- original_words : {original_words} ----- ')
        non_stop_words = self.remove_stopwords(original_words)
        print(f' ---- non_stop_words : {non_stop_words} ----- ')
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
        print(f' ---- common_ngrams : {common_ngrams} ----- ')
        print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')

        mask_indices = []

        # Extract n-gram positions in non-stop words
        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]

        # Mask a word before the first common n-gram
        if ngram_positions:
            print(f' ---- ngram_positions : {ngram_positions} ----- ')
            first_ngram_start = ngram_positions[0][0]
            print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
            if first_ngram_start > 0:
                mask_index_before_ngram = random.randint(0, first_ngram_start - 1)
                print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
                mask_indices.append(mask_index_before_ngram)

            # Mask words between common n-grams
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                print(f' ---- end_prev : {end_prev} ----- ')
                start_next = ngram_positions[i + 1][0]
                print(f' ---- start_next : {start_next} ----- ')
                if start_next > end_prev + 1:
                    mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
                    print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
                    mask_indices.append(mask_index_between_ngrams)

            # Mask a word after the last common n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
                mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
                print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
                mask_indices.append(mask_index_after_ngram)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                non_stop_idx += 1

        # Map mask indices from non-stop word positions to original positions
        print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ')
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        print(f' ---- original_mask_indices : {original_mask_indices} ----- ')

        # Apply masks to the original sentence
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token

        return " ".join(masked_words)

    def calculate_word_entropy(self, sentence, word_position):
        """
        Calculate entropy for a specific word position in the sentence.

        Args:
            sentence (str): The input sentence
            word_position (int): Position of the word to calculate entropy for

        Returns:
            float: Entropy value for the word
        """
        words = sentence.split()
        masked_words = words.copy()
        masked_words[word_position] = self.tokenizer.mask_token
        masked_sentence = " ".join(masked_words)

        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        # Get probabilities for the masked position
        probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1)
        # Calculate entropy: -sum(p * log(p))
        entropy = -torch.sum(probs * torch.log(probs + 1e-9))

        return entropy.item()

    def mask_sentence_entropy(self, sentence, common_ngrams):
        """
        Mask words in the sentence based on entropy, following n-gram positioning rules.

        Args:
            sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence
        """
        original_words = sentence.split()
        non_stop_words = self.remove_stopwords(original_words)
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        original_to_non_stop = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                original_to_non_stop[orig_idx] = non_stop_idx
                non_stop_idx += 1

        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
        mask_indices = []

        if ngram_positions:
            # Handle words before first n-gram
            first_ngram_start = ngram_positions[0][0]
            if first_ngram_start > 0:
                # Calculate entropy for all candidate positions
                candidate_positions = range(0, first_ngram_start)
                entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                             for pos in candidate_positions]
                # Select position with highest entropy
                mask_indices.append(max(entropies, key=lambda x: x[1])[0])

            # Handle words between n-grams
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                start_next = ngram_positions[i + 1][0]
                if start_next > end_prev + 1:
                    candidate_positions = range(end_prev + 1, start_next)
                    entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                                 for pos in candidate_positions]
                    mask_indices.append(max(entropies, key=lambda x: x[1])[0])

            # Handle words after last n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                candidate_positions = range(last_ngram_end + 1, len(non_stop_words))
                entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                             for pos in candidate_positions]
                mask_indices.append(max(entropies, key=lambda x: x[1])[0])

        # Map mask indices to original sentence positions and apply masks
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token

        return " ".join(masked_words)

    def calculate_mask_logits(self, masked_sentence):
        """
        Calculate logits for masked tokens in the sentence using BERT.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens.

        Returns:
            dict: Masked token indices and their logits.
        """
        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
        return mask_logits

    def process_sentences(self, sentences, result_dict, method="random"):
        """
        Process sentences and calculate logits for masked tokens.

        Args:
            sentences (list): List of sentences
            result_dict (dict): Dictionary of common n-grams
            method (str): Masking method ("random" or "entropy")

        Returns:
            dict: Masked sentences and logits for each sentence
        """
        results = {}

        for sentence, ngrams in result_dict.items():
            if method == "random":
                masked_sentence = self.mask_sentence_random(sentence, ngrams)
            elif method == "pseudorandom":
                masked_sentence = self.mask_sentence_pseudorandom(sentence, ngrams)
            else:  # entropy
                masked_sentence = self.mask_sentence_entropy(sentence, ngrams)

            logits = self.calculate_mask_logits(masked_sentence)
            results[sentence] = {
                "masked_sentence": masked_sentence,
                "mask_logits": logits
            }

        return results


if __name__ == "__main__":
    # !!! Working both the cases regardless if the stopword is removed or not
    sentences = [
        "The quick brown fox jumps over the lazy dog everyday.",
        # "A speedy brown fox jumps over a lazy dog.",
        # "A swift brown fox leaps over the lethargic dog."
    ]
    result_dict = {
        'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        # 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        # 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
    }

    processor = MaskingProcessor()
    # results_random = processor.process_sentences(sentences, result_dict)
    results_entropy = processor.process_sentences(sentences, result_dict, method="random")

    # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)

    for sentence, output in results_entropy.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {output['masked_sentence']}")
        # print(f"Mask Logits (Random): {output['mask_logits']}")
        print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
        print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
        print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
        print('--------------------------------')
        for mask_idx, logits in output["mask_logits"].items():
            print(f"Logits for [MASK] at position {mask_idx}:")
            print(f' logits : {logits[:5]}')  # List of logits for all vocabulary tokens
            print(f' len(logits) : {len(logits)}')


# -------------------------------------------------------------------------------------------
# def mask_sentence(self, sentence, common_ngrams):
#     """
#     Mask words in the sentence based on the specified rules after removing stopwords.
#
#     Args:
#         sentence (str): Original sentence.
#         common_ngrams (dict): Common n-grams and their indices.
#
#     Returns:
#         str: Masked sentence.
#     """
#     original_words = sentence.split()
#     print(f' ---- original_words : {original_words} ----- ')
#     non_stop_words = self.remove_stopwords(original_words)
#     print(f' ---- non_stop_words : {non_stop_words} ----- ')
#     adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
#     print(f' ---- common_ngrams : {common_ngrams} ----- ')
#     print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')
#
#     mask_indices = []
#
#     # Extract n-gram positions in non-stop words
#     ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
#     print(f' ---- ngram_positions : {ngram_positions} ----- ')
#     # Mask a word before the first common n-gram
#     if ngram_positions:
#         first_ngram_start = ngram_positions[0][0]
#         print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
#         if first_ngram_start > 0:
#             mask_index_before_ngram = random.randint(0, first_ngram_start-1)
#             print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
#             mask_indices.append(mask_index_before_ngram)
#
#         # Mask words between common n-grams
#         for i in range(len(ngram_positions) - 1):
#             end_prev = ngram_positions[i][1]
#             print(f' ---- end_prev : {end_prev} ----- ')
#             start_next = ngram_positions[i + 1][0]
#             print(f' ---- start_next : {start_next} ----- ')
#             if start_next > end_prev + 1:
#                 mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
#                 print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
#                 mask_indices.append(mask_index_between_ngrams)
#
#         # Mask a word after the last common n-gram
#         last_ngram_end = ngram_positions[-1][1]
#         print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
#         if last_ngram_end < len(non_stop_words) - 1:
#             mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
#             print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
#             mask_indices.append(mask_index_after_ngram)
#
#     # Map mask indices back to original sentence
#     adjusted_indices = [
#         orig for orig, non_stop in enumerate(original_words)
#         if non_stop in mask_indices
#     ]
#
#     # Apply masks to the original sentence
#     for idx in adjusted_indices:
#         original_words[idx] = self.tokenizer.mask_token
#
#     return " ".join(original_words)
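
The only difference between `mask_sentence_random` and `mask_sentence_pseudorandom` above is the `random.seed(42)` call, which makes the gap selection repeatable across runs. A tiny standalone demonstration (the `pick_gap` helper is hypothetical, written just for this note, not part of the repository):

    import random

    def pick_gap(first_ngram_start, seed=None):
        # Pick a random word index before the first common n-gram, optionally reseeding first.
        if seed is not None:
            random.seed(seed)
        return random.randint(0, first_ngram_start - 1)

    print(pick_gap(4, seed=42), pick_gap(4, seed=42))  # identical picks on every run
    print(pick_gap(4), pick_gap(4))                    # unseeded picks may differ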
utils/old/masking/masking_methods_ok_working.py
ADDED
@@ -0,0 +1,257 @@
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM
from nltk.corpus import stopwords
import nltk

# Ensure stopwords are downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class MaskingProcessor:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.stop_words = set(stopwords.words('english'))

    def adjust_ngram_indices(self, words, common_ngrams, remove_stopwords):
        """
        Adjust indices of common n-grams after removing stop words.

        Args:
            words (list): List of words in the original sentence.
            common_ngrams (dict): Common n-grams and their indices.

        Returns:
            dict: Adjusted common n-grams and their indices.
        """
        if not remove_stopwords:
            return common_ngrams

        non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
        adjusted_ngrams = {}

        for ngram, positions in common_ngrams.items():
            adjusted_positions = []
            for start, end in positions:
                try:
                    new_start = non_stop_word_indices.index(start)
                    new_end = non_stop_word_indices.index(end)
                    adjusted_positions.append((new_start, new_end))
                except ValueError:
                    continue  # Skip if indices cannot be mapped
            adjusted_ngrams[ngram] = adjusted_positions

        return adjusted_ngrams

    def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (random selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence with original stop words retained
        """
        words = original_sentence.split()
        non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
        adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)

        mask_indices = []
        # Handle before the first common n-gram
        if adjusted_ngrams:
            first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
            if first_ngram_start > 0:
                mask_indices.append(random.randint(0, first_ngram_start - 1))

            # Handle between common n-grams
            ngram_positions = list(adjusted_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                if start_next > end_prev + 1:
                    mask_indices.append(random.randint(end_prev + 1, start_next - 1))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                mask_indices.append(random.randint(last_ngram_end + 1, len(non_stop_words) - 1))

        # Mask the chosen indices
        original_masked_sentence = words[:]
        for idx in mask_indices:
            if idx not in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                non_stop_words[idx] = self.tokenizer.mask_token
                original_masked_sentence[idx] = self.tokenizer.mask_token

        return " ".join(original_masked_sentence)

    def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (highest entropy selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence with original stop words retained
        """
        words = original_sentence.split()
        non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
        adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)
        entropy_scores = {}

        for idx, word in enumerate(non_stop_words):
            if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                continue  # Skip words in common n-grams

            masked_sentence = non_stop_words[:idx] + [self.tokenizer.mask_token] + non_stop_words[idx + 1:]
            masked_sentence = " ".join(masked_sentence)
            input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
            mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

            with torch.no_grad():
                outputs = self.model(input_ids)
                logits = outputs.logits

            filtered_logits = logits[0, mask_token_index, :]
            probs = torch.softmax(filtered_logits, dim=-1)
            entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item()  # Add epsilon to prevent log(0)
            entropy_scores[idx] = entropy

        mask_indices = []

        # Handle before the first common n-gram
        if adjusted_ngrams:
            first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
            candidates = [i for i in range(first_ngram_start) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle between common n-grams
            ngram_positions = list(adjusted_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores]
                if candidates:
                    mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            candidates = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

        # Mask the chosen indices
        original_masked_sentence = words[:]
        for idx in mask_indices:
            non_stop_words[idx] = self.tokenizer.mask_token
            original_masked_sentence[idx] = self.tokenizer.mask_token

        return " ".join(original_masked_sentence)

    def calculate_mask_logits(self, masked_sentence):
        """
        Calculate logits for masked tokens in the sentence using BERT.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens

        Returns:
            dict: Masked token indices and their logits
        """
        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
        return mask_logits

    def process_sentences(self, original_sentences, result_dict, method="random", remove_stopwords=False):
        """
        Process a list of sentences and calculate logits for masked tokens using the specified method.

        Args:
            original_sentences (list): List of original sentences
            result_dict (dict): Common n-grams and their indices for each sentence
            method (str): Masking method ("random" or "entropy")

        Returns:
            dict: Masked sentences and their logits for each sentence
        """
        results = {}

        for sentence, ngrams in result_dict.items():
            if method == "random":
                masked_sentence = self.mask_sentence_random(sentence, ngrams, remove_stopwords)
            elif method == "entropy":
|
199 |
+
masked_sentence = self.mask_sentence_entropy(sentence, ngrams, remove_stopwords)
|
200 |
+
else:
|
201 |
+
raise ValueError("Invalid method. Choose 'random' or 'entropy'.")
|
202 |
+
|
203 |
+
logits = self.calculate_mask_logits(masked_sentence)
|
204 |
+
results[sentence] = {
|
205 |
+
"masked_sentence": masked_sentence,
|
206 |
+
"mask_logits": logits
|
207 |
+
}
|
208 |
+
|
209 |
+
return results
|
210 |
+
|
211 |
+
# Example usage
|
212 |
+
if __name__ == "__main__":
|
213 |
+
# !!! Working both the cases regardless if the stopword is removed or not
|
214 |
+
sentences = [
|
215 |
+
"The quick brown fox jumps over the lazy dog.",
|
216 |
+
"A quick brown dog outpaces a lazy fox.",
|
217 |
+
"Quick brown animals leap over lazy obstacles."
|
218 |
+
]
|
219 |
+
|
220 |
+
result_dict = {
|
221 |
+
"The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]},
|
222 |
+
"A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]},
|
223 |
+
"Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]}
|
224 |
+
}
|
225 |
+
|
226 |
+
# result_dict = {
|
227 |
+
# "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
|
228 |
+
# "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
|
229 |
+
# "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
|
230 |
+
# }
|
231 |
+
|
232 |
+
processor = MaskingProcessor()
|
233 |
+
results_random = processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)
|
234 |
+
# results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)
|
235 |
+
|
236 |
+
for sentence, output in results_random.items():
|
237 |
+
print(f"Original Sentence (Random): {sentence}")
|
238 |
+
print(f"Masked Sentence (Random): {output['masked_sentence']}")
|
239 |
+
# print(f"Mask Logits (Random): {output['mask_logits']}")
|
240 |
+
print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
|
241 |
+
print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
|
242 |
+
print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
|
243 |
+
print('--------------------------------')
|
244 |
+
for mask_idx, logits in output["mask_logits"].items():
|
245 |
+
print(f"Logits for [MASK] at position {mask_idx}:")
|
246 |
+
print(f' logits : {logits[:5]}') # List of logits for all vocabulary tokens
|
247 |
+
|
248 |
+
|
249 |
+
|
250 |
+
# print('--------------------------------')
|
251 |
+
# for sentence, output in results_entropy.items():
|
252 |
+
# print(f"Original Sentence (Entropy): {sentence}")
|
253 |
+
# print(f"Masked Sentence (Entropy): {output['masked_sentence']}")
|
254 |
+
# # print(f"Mask Logits (Entropy): {output['mask_logits']}")
|
255 |
+
# print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
|
256 |
+
# print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
|
257 |
+
# print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
|
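Editor's note: the masking above depends on the index remapping that adjust_ngram_indices performs once stopwords are dropped. The following is a minimal standalone sketch (my illustration, not part of the committed files) of that mapping, using the same example sentence as the script above; the variable names are assumptions made for the example only.

# Sketch: how original word positions shift after stopword removal.
import nltk
from nltk.corpus import stopwords

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
words = "The quick brown fox jumps over the lazy dog.".split()

# Map each original position to its position in the stopword-free list,
# which is what the adjusted n-gram spans above index into.
orig_to_non_stop = {}
non_stop_idx = 0
for orig_idx, word in enumerate(words):
    if word.lower() not in stop_words:
        orig_to_non_stop[orig_idx] = non_stop_idx
        non_stop_idx += 1

print(orig_to_non_stop)  # expected to look like {1: 0, 2: 1, 3: 2, 4: 3, 7: 4, 8: 5}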
utils/old/masking/masking_methods_v1_working.py
ADDED
@@ -0,0 +1,233 @@
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM
from nltk.corpus import stopwords
import nltk

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# THIS IS WORKING WHEN THE COORDINATES ARE WITHOUT REMOVING STOPWORDS
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


# Ensure stopwords are downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class MaskingProcessor:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.stop_words = set(stopwords.words('english'))

    def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords=False):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (random selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence
        """
        if remove_stopwords:
            words = original_sentence.split()
            words = [word for word in words if word not in self.stop_words]
        else:
            words = original_sentence.split()

        mask_indices = []
        # Handle before the first common n-gram
        if common_ngrams:
            first_ngram_start = list(common_ngrams.values())[0][0][0]
            if first_ngram_start > 0:
                mask_indices.append(random.randint(0, first_ngram_start - 1))

            # Handle between common n-grams
            ngram_positions = list(common_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                if start_next > end_prev + 1:
                    mask_indices.append(random.randint(end_prev + 1, start_next - 1))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            if last_ngram_end < len(words) - 1:
                mask_indices.append(random.randint(last_ngram_end + 1, len(words) - 1))

        # Mask the chosen indices
        for idx in mask_indices:
            if idx not in [index for ngram_indices in common_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                words[idx] = self.tokenizer.mask_token

        return " ".join(words)

    def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords=False):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (highest entropy selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence
        """
        if remove_stopwords:
            words = original_sentence.split()
            words = [word for word in words if word not in self.stop_words]
        else:
            words = original_sentence.split()
        entropy_scores = {}

        for idx, word in enumerate(words):
            if idx in [index for ngram_indices in common_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                continue  # Skip words in common n-grams

            masked_sentence = words[:idx] + [self.tokenizer.mask_token] + words[idx + 1:]
            masked_sentence = " ".join(masked_sentence)
            input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
            mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

            with torch.no_grad():
                outputs = self.model(input_ids)
                logits = outputs.logits

            filtered_logits = logits[0, mask_token_index, :]
            probs = torch.softmax(filtered_logits, dim=-1)
            entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item()  # Add epsilon to prevent log(0)
            entropy_scores[idx] = entropy

        mask_indices = []

        # Handle before the first common n-gram
        if common_ngrams:
            first_ngram_start = list(common_ngrams.values())[0][0][0]
            candidates = [i for i in range(first_ngram_start) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle between common n-grams
            ngram_positions = list(common_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores]
                if candidates:
                    mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            candidates = [i for i in range(last_ngram_end + 1, len(words)) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

        # Mask the chosen indices
        for idx in mask_indices:
            words[idx] = self.tokenizer.mask_token

        return " ".join(words)

    def calculate_mask_logits(self, masked_sentence):
        """
        Calculate logits for masked tokens in the sentence using BERT.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens

        Returns:
            dict: Masked token indices and their logits
        """
        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
        return mask_logits

    def process_sentences(self, original_sentences, result_dict, remove_stopwords=False, method="random"):
        """
        Process a list of sentences and calculate logits for masked tokens using the specified method.

        Args:
            original_sentences (list): List of original sentences
            result_dict (dict): Common n-grams and their indices for each sentence
            method (str): Masking method ("random" or "entropy")

        Returns:
            dict: Masked sentences and their logits for each sentence
        """
        results = {}

        for sentence, ngrams in result_dict.items():
            if method == "random":
                masked_sentence = self.mask_sentence_random(sentence, ngrams)
            elif method == "entropy":
                masked_sentence = self.mask_sentence_entropy(sentence, ngrams)
            else:
                raise ValueError("Invalid method. Choose 'random' or 'entropy'.")

            logits = self.calculate_mask_logits(masked_sentence)
            results[sentence] = {
                "masked_sentence": masked_sentence,
                "mask_logits": logits
            }

        return results

# Example usage
if __name__ == "__main__":
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # THIS IS WORKING WHEN THE COORDINATES ARE WITHOUT REMOVING STOPWORDS

    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown animals leap over lazy obstacles."
    ]

    result_dict = {
        "The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]},
        "A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]},
        "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]}
    }

    # result_dict = {
    #     "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    #     "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    #     "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
    # }

    processor = MaskingProcessor()
    results_random = processor.process_sentences(sentences, result_dict, remove_stopwords=True, method="random")
    results_entropy = processor.process_sentences(sentences, result_dict, remove_stopwords=True, method="entropy")

    for sentence, output in results_random.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {output['masked_sentence']}")
        # print(f"Mask Logits (Random): {output['mask_logits']}")

    for sentence, output in results_entropy.items():
        print(f"Original Sentence (Entropy): {sentence}")
        print(f"Masked Sentence (Entropy): {output['masked_sentence']}")
        # print(f"Mask Logits (Entropy): {output['mask_logits']}")




'''
result_dict = {
    "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
}

'''
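Editor's note: the banner comment in masking_methods_v1_working.py stresses that the n-gram coordinates must index the raw sentence.split() token list (stopwords still present). The snippet below is my own illustrative sanity check of that assumption, not part of the committed files; the helper name coordinates_match is invented for the example.

# Sketch: verify that a (start, end) span in result_dict really points at the
# named n-gram in the raw, stopword-inclusive word list the masker splits on.
def coordinates_match(sentence, ngram, span):
    words = sentence.split()
    start, end = span
    candidate = " ".join(words[start:end + 1]).lower().rstrip(".,!?;:")
    return candidate == ngram.lower()

print(coordinates_match("The quick brown fox jumps over the lazy dog.", "quick brown", (1, 2)))  # True
print(coordinates_match("The quick brown fox jumps over the lazy dog.", "lazy", (7, 7)))         # True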
utils/old/masking_methods_final_copy.py
ADDED
@@ -0,0 +1,619 @@
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM
from nltk.corpus import stopwords
import nltk
from transformers import RobertaTokenizer, RobertaForMaskedLM


# Ensure stopwords are downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class MaskingProcessor:
    # def __init__(self, tokenizer, model):
    def __init__(self):
        # self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        # self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")

        # self.tokenizer = tokenizer
        # self.model = model

        self.tokenizer = BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
        self.model = BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")

        # self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        # self.model = RobertaForMaskedLM.from_pretrained("roberta-base")

        self.stop_words = set(stopwords.words('english'))

    def remove_stopwords(self, words):
        """
        Remove stopwords from the given list of words.

        Args:
            words (list): List of words.

        Returns:
            list: List of non-stop words.
        """
        return [word for word in words if word.lower() not in self.stop_words]

    def adjust_ngram_indices(self, original_words, common_ngrams):
        """
        Adjust indices of common n-grams after removing stopwords.

        Args:
            original_words (list): Original list of words.
            common_ngrams (dict): Common n-grams and their indices.

        Returns:
            dict: Adjusted common n-grams with updated indices.
        """
        non_stop_words = self.remove_stopwords(original_words)
        original_to_non_stop = []
        non_stop_idx = 0

        for original_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                original_to_non_stop.append((original_idx, non_stop_idx))
                non_stop_idx += 1

        adjusted_ngrams = {}
        for ngram, positions in common_ngrams.items():
            adjusted_positions = []
            for start, end in positions:
                try:
                    new_start = next(non_stop for orig, non_stop in original_to_non_stop if orig == start)
                    new_end = next(non_stop for orig, non_stop in original_to_non_stop if orig == end)
                    adjusted_positions.append((new_start, new_end))
                except StopIteration:
                    continue  # Skip if indices cannot be mapped
            adjusted_ngrams[ngram] = adjusted_positions

        return adjusted_ngrams

    def mask_sentence_random(self, sentence, common_ngrams):
        """
        Mask words in the sentence based on the specified rules after removing stopwords.
        """
        # Split sentence into words
        original_words = sentence.split()

        # Handle punctuation at the end
        has_punctuation = False
        punctuation = None
        if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
            has_punctuation = True
            punctuation = original_words[-1][-1]
            original_words = original_words[:-1]

        print(f' ---- original_words : {original_words} ----- ')

        # Process words without punctuation
        non_stop_words = self.remove_stopwords(original_words)
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)

        # Rest of the existing function code...
        mask_indices = []
        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]

        if ngram_positions:
            first_ngram_start = ngram_positions[0][0]
            if first_ngram_start > 0:
                mask_index_before_ngram = random.randint(0, first_ngram_start - 1)
                mask_indices.append(mask_index_before_ngram)

            # Mask words between common n-grams
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                start_next = ngram_positions[i + 1][0]
                if start_next > end_prev + 1:
                    mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
                    mask_indices.append(mask_index_between_ngrams)

            # Mask a word after the last common n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
                mask_indices.append(mask_index_after_ngram)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                non_stop_idx += 1

        # Map mask indices and apply masks
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token
            # masked_words[idx] = '<mask>'  # for roberta

        # Add back punctuation if it existed
        if has_punctuation:
            masked_words.append(punctuation)

        print(f' ***** masked_words at end : {masked_words} ***** ')
        print(f' ***** original_mask_indices : {original_mask_indices} ***** ')
        print(f' ***** TESTING : {" ".join(masked_words)} ***** ')

        return " ".join(masked_words), original_mask_indices

    def mask_sentence_pseudorandom(self, sentence, common_ngrams):
        """
        Mask words in the sentence based on the specified rules after removing stopwords.
        """
        # Split sentence into words
        random.seed(3)
        original_words = sentence.split()

        # Handle punctuation at the end
        has_punctuation = False
        punctuation = None
        if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
            has_punctuation = True
            punctuation = original_words[-1][-1]
            original_words = original_words[:-1]

        print(f' ---- original_words : {original_words} ----- ')

        # Process words without punctuation
        non_stop_words = self.remove_stopwords(original_words)
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)

        # Rest of the existing function code...
        mask_indices = []
        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]

        if ngram_positions:
            first_ngram_start = ngram_positions[0][0]
            if first_ngram_start > 0:
                mask_index_before_ngram = random.randint(0, first_ngram_start - 1)
                mask_indices.append(mask_index_before_ngram)

            # Mask words between common n-grams
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                start_next = ngram_positions[i + 1][0]
                if start_next > end_prev + 1:
                    mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
                    mask_indices.append(mask_index_between_ngrams)

            # Mask a word after the last common n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
                mask_indices.append(mask_index_after_ngram)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                non_stop_idx += 1

        # Map mask indices and apply masks
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token
            # masked_words[idx] = '<mask>'  # for roberta

        # Add back punctuation if it existed
        if has_punctuation:
            masked_words.append(punctuation)

        print(f' ***** masked_words at end : {masked_words} ***** ')
        print(f' ***** original_mask_indices : {original_mask_indices} ***** ')
        print(f' ***** TESTING : {" ".join(masked_words)} ***** ')

        return " ".join(masked_words), original_mask_indices


    def calculate_word_entropy(self, sentence, word_position):
        """
        Calculate entropy for a specific word position in the sentence.

        Args:
            sentence (str): The input sentence
            word_position (int): Position of the word to calculate entropy for

        Returns:
            float: Entropy value for the word
        """
        words = sentence.split()
        masked_words = words.copy()
        masked_words[word_position] = self.tokenizer.mask_token
        masked_sentence = " ".join(masked_words)

        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        # Get probabilities for the masked position
        probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1)
        # Calculate entropy: -sum(p * log(p))
        entropy = -torch.sum(probs * torch.log(probs + 1e-9))

        return entropy.item()

    def mask_sentence_entropy(self, sentence, common_ngrams):
        """
        Mask words in the sentence based on entropy, following n-gram positioning rules.

        Args:
            sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence
        """
        # Split sentence into words
        original_words = sentence.split()

        # Handle punctuation at the end
        has_punctuation = False
        punctuation = None
        if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
            has_punctuation = True
            punctuation = original_words[-1][-1]
            original_words = original_words[:-1]

        # Process words without punctuation
        non_stop_words = self.remove_stopwords(original_words)
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        original_to_non_stop = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                original_to_non_stop[orig_idx] = non_stop_idx
                non_stop_idx += 1

        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
        mask_indices = []

        if ngram_positions:
            # Handle words before first n-gram
            first_ngram_start = ngram_positions[0][0]
            if first_ngram_start > 0:
                candidate_positions = range(0, first_ngram_start)
                entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                             for pos in candidate_positions]
                mask_indices.append(max(entropies, key=lambda x: x[1])[0])

            # Handle words between n-grams
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                start_next = ngram_positions[i + 1][0]
                if start_next > end_prev + 1:
                    candidate_positions = range(end_prev + 1, start_next)
                    entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                                 for pos in candidate_positions]
                    mask_indices.append(max(entropies, key=lambda x: x[1])[0])

            # Handle words after last n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                candidate_positions = range(last_ngram_end + 1, len(non_stop_words))
                entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                             for pos in candidate_positions]
                mask_indices.append(max(entropies, key=lambda x: x[1])[0])

        # Map mask indices to original sentence positions and apply masks
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token

        # Add back punctuation if it existed
        if has_punctuation:
            masked_words.append(punctuation)

        return " ".join(masked_words), original_mask_indices

    def calculate_mask_logits(self, original_sentence, original_mask_indices):
        """
        Calculate logits for masked tokens in the sentence using BERT.

        Args:
            original_sentence (str): Original sentence without masks
            original_mask_indices (list): List of indices to mask

        Returns:
            dict: Masked token indices and their logits
        """
        print('==========================================================================================================')
        words = original_sentence.split()
        print(f' ##### calculate_mask_logits >> words : {words} ##### ')
        mask_logits = {}

        for idx in original_mask_indices:
            # Create a copy of words and mask the current position
            print(f' ---- idx : {idx} ----- ')
            masked_words = words.copy()
            masked_words[idx] = '[MASK]'
            # masked_words[idx] = '<mask>'  # for roberta
            masked_sentence = " ".join(masked_words)
            print(f' ---- masked_sentence : {masked_sentence} ----- ')

            # Calculate logits for the current mask
            input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
            mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

            with torch.no_grad():
                outputs = self.model(input_ids)
                logits = outputs.logits

            # Extract logits for the masked position
            mask_logits_tensor = logits[0, mask_token_index, :]

            # Get top logits and corresponding tokens
            top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 100, dim=-1)  # Get more candidates

            # Convert token IDs to words and filter out subword tokens
            top_tokens = []
            top_logits = []
            seen_words = set()  # To keep track of unique words

            for token_id, logit in zip(top_mask_indices[0], top_mask_logits[0]):
                token = self.tokenizer.convert_ids_to_tokens(token_id.item())

                # Skip if it's a subword token (starts with ##)
                if token.startswith('##'):
                    continue

                # Convert token to proper word
                word = self.tokenizer.convert_tokens_to_string([token]).strip()

                # Only add if it's a new word and not empty
                if word and word not in seen_words:
                    seen_words.add(word)
                    top_tokens.append(word)
                    top_logits.append(logit.item())

                # Break if we have 50 unique complete words
                if len(top_tokens) == 50:
                    break

            # print(f' ---- top_tokens : {top_tokens} ----- ')

            # Store results
            mask_logits[idx] = {
                "tokens": top_tokens,
                "logits": top_logits
            }

        return mask_logits

    # def calculate_mask_logits(self, original_sentence, original_mask_indices):
    #     """
    #     Calculate logits for masked tokens in the sentence using BERT.

    #     Args:
    #         original_sentence (str): Original sentence without masks
    #         original_mask_indices (list): List of indices to mask

    #     Returns:
    #         dict: Masked token indices and their logits
    #     """
    #     words = original_sentence.split()
    #     print(f' ##### calculate_mask_logits >> words : {words} ##### ')
    #     mask_logits = {}

    #     for idx in original_mask_indices:
    #         # Create a copy of words and mask the current position
    #         print(f' ---- idx : {idx} ----- ')
    #         masked_words = words.copy()
    #         print(f' ---- words : {masked_words} ----- ')
    #         # masked_words[idx] = self.tokenizer.mask_token
    #         masked_words[idx] = '[MASK]'
    #         print(f' ---- masked_words : {masked_words} ----- ')
    #         masked_sentence = " ".join(masked_words)
    #         print(f' ---- masked_sentence : {masked_sentence} ----- ')

    #         # Calculate logits for the current mask
    #         input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
    #         mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

    #         with torch.no_grad():
    #             outputs = self.model(input_ids)
    #             logits = outputs.logits

    #         # Extract logits for the masked position
    #         mask_logits_tensor = logits[0, mask_token_index, :]

    #         # Get top 50 logits and corresponding tokens
    #         top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 50, dim=-1)

    #         # Convert token IDs to words
    #         top_tokens = [self.tokenizer.convert_ids_to_tokens(token_id.item()) for token_id in top_mask_indices[0]]
    #         print(f' ---- top_tokens : {top_tokens} ----- ')

    #         # Store results
    #         mask_logits[idx] = {
    #             "tokens": top_tokens,
    #             "logits": top_mask_logits.tolist()
    #         }

    #     return mask_logits


    def process_sentences(self, sentences, result_dict, method="random"):
        """
        Process sentences and calculate logits for masked tokens.
        """
        results = {}

        for sentence, ngrams in result_dict.items():
            # Split punctuation from the last word before processing
            words = sentence.split()
            last_word = words[-1]
            if any(last_word.endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
                # Split the last word and punctuation
                words[-1] = last_word[:-1]
                punctuation = last_word[-1]
                # Rejoin with space before punctuation to treat it as separate token
                processed_sentence = " ".join(words) + " " + punctuation
            else:
                processed_sentence = sentence

            if method == "random":
                masked_sentence, original_mask_indices = self.mask_sentence_random(processed_sentence, ngrams)
            elif method == "pseudorandom":
                masked_sentence, original_mask_indices = self.mask_sentence_pseudorandom(processed_sentence, ngrams)
            else:  # entropy
                masked_sentence, original_mask_indices = self.mask_sentence_entropy(processed_sentence, ngrams)

            logits = self.calculate_mask_logits(processed_sentence, original_mask_indices)
            results[sentence] = {
                "masked_sentence": masked_sentence,
                "mask_logits": logits
            }

        return results



if __name__ == "__main__":
    # !!! Working in both cases, regardless of whether stopwords are removed or not
    sentences = [
        "The quick brown fox jumps over small cat the lazy dog everyday again and again .",
        # "A speedy brown fox jumps over a lazy dog.",
        # "A swift brown fox leaps over the lethargic dog."
    ]
    result_dict = {
        'The quick brown fox jumps over small cat the lazy dog everyday again and again .': {'brown fox': [(2, 3)], 'cat': [(7, 7)], 'dog': [(10, 10)]},
        # 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        # 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
    }

    processor = MaskingProcessor()
    # results_random = processor.process_sentences(sentences, result_dict)
    results_entropy = processor.process_sentences(sentences, result_dict, method="random")

    '''
    results structure :
    results = {
        "The quick brown fox jumps over the lazy dog everyday.":
        {   # Original sentence as key
            "masked_sentence": str,  # The sentence with [MASK] tokens
            "mask_logits":
            {   # Dictionary of mask positions and their predictions
                1:
                {   # Position of mask in sentence
                    "tokens" (words) : list,              # List of top 50 predicted tokens
                    "logits" (probabilities) : list       # Corresponding logits for those tokens
                },
                7:
                {
                    "tokens" (words) : list,
                    "logits" (probabilities) : list
                },
                10:
                {
                    "tokens (words)": list,
                    "logits (probabilities)": list
                }
            }
        }
    }

    '''
    # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)

    for sentence, output in results_entropy.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {output['masked_sentence']}")
        # print(f"Mask Logits (Random): {output['mask_logits']}")
        # print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
        # print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
        # print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
        # print('--------------------------------')
        # for mask_idx, logits in output["mask_logits"].items():
        #     print(f"Logits for [MASK] at position {mask_idx}:")
        #     print(f' logits : {logits[:5]}')  # List of logits for all vocabulary tokens
        #     print(f' len(logits) : {len(logits)}')


# ------------------------------------------------------------------------------------------------
# def mask_sentence_random(self, sentence, common_ngrams):
#     """
#     Mask words in the sentence based on the specified rules after removing stopwords.
#     """
#     original_words = sentence.split()
#     # print(f' ---- original_words : {original_words} ----- ')
#     non_stop_words = self.remove_stopwords(original_words)
#     # print(f' ---- non_stop_words : {non_stop_words} ----- ')
#     adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
#     # print(f' ---- common_ngrams : {common_ngrams} ----- ')
#     # print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')

#     mask_indices = []

#     # Extract n-gram positions in non-stop words
#     ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]

#     # Mask a word before the first common n-gram
#     if ngram_positions:
#         # print(f' ---- ngram_positions : {ngram_positions} ----- ')
#         first_ngram_start = ngram_positions[0][0]
#         # print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
#         if first_ngram_start > 0:
#             mask_index_before_ngram = random.randint(0, first_ngram_start-1)
#             # print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
#             mask_indices.append(mask_index_before_ngram)

#         # Mask words between common n-grams
#         for i in range(len(ngram_positions) - 1):
#             end_prev = ngram_positions[i][1]
#             # print(f' ---- end_prev : {end_prev} ----- ')
#             start_next = ngram_positions[i + 1][0]
#             # print(f' ---- start_next : {start_next} ----- ')
#             if start_next > end_prev + 1:
#                 mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
#                 # print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
#                 mask_indices.append(mask_index_between_ngrams)

#         # Mask a word after the last common n-gram
#         last_ngram_end = ngram_positions[-1][1]
#         if last_ngram_end < len(non_stop_words) - 1:
#             # print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
#             mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
#             # print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
#             mask_indices.append(mask_index_after_ngram)

#     # Create mapping from non-stop words to original indices
#     non_stop_to_original = {}
#     non_stop_idx = 0
#     for orig_idx, word in enumerate(original_words):
#         if word.lower() not in self.stop_words:
#             non_stop_to_original[non_stop_idx] = orig_idx
#             non_stop_idx += 1

#     # Map mask indices from non-stop word positions to original positions
#     # print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ')
#     original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
#     # print(f' ---- original_mask_indices : {original_mask_indices} ----- ')

#     # Apply masks to the original sentence
#     masked_words = original_words.copy()
#     for idx in original_mask_indices:
#         masked_words[idx] = self.tokenizer.mask_token

#     return " ".join(masked_words), original_mask_indices
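Editor's note: the triple-quoted comment in this file documents the results structure (per-sentence masked sentence plus, for each mask position, parallel "tokens" and "logits" lists). The snippet below is my own consumption sketch, not part of the committed files; the function name top_candidates and the variable names are assumptions for illustration.

# Sketch: turn the stored raw logits into probabilities and keep, for every
# masked position, the k most likely replacement words.
import torch

def top_candidates(results, k=5):
    summary = {}
    for sentence, output in results.items():
        per_position = {}
        for pos, entry in output["mask_logits"].items():
            probs = torch.softmax(torch.tensor(entry["logits"]), dim=-1)
            ranked = sorted(zip(entry["tokens"], probs.tolist()), key=lambda t: t[1], reverse=True)
            per_position[pos] = ranked[:k]
        summary[sentence] = per_position
    return summary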
utils/old/non_melting_points_v1.py
ADDED
@@ -0,0 +1,244 @@
1 |
+
import nltk
|
2 |
+
from nltk.corpus import stopwords
|
3 |
+
from nltk.util import ngrams
|
4 |
+
from collections import Counter
|
5 |
+
import re
|
6 |
+
|
7 |
+
class NgramProcessor:
|
8 |
+
def __init__(self):
|
9 |
+
try:
|
10 |
+
nltk.data.find('corpora/stopwords')
|
11 |
+
except LookupError:
|
12 |
+
nltk.download('stopwords')
|
13 |
+
|
14 |
+
self.stop_words = set(stopwords.words('english'))
|
15 |
+
|
16 |
+
def remove_stopwords(self, text):
|
17 |
+
"""
|
18 |
+
Remove stopwords using NLTK's stopword list
|
19 |
+
|
20 |
+
Args:
|
21 |
+
text (str): Input text
|
22 |
+
|
23 |
+
Returns:
|
24 |
+
str: Cleaned text with stopwords removed
|
25 |
+
"""
|
26 |
+
words = re.findall(r'\w+', text.lower())
|
27 |
+
filtered_words = [word for word in words if word not in self.stop_words]
|
28 |
+
return ' '.join(filtered_words)
|
29 |
+
|
30 |
+
def is_exact_match(self, ngram, sentences):
|
31 |
+
"""
|
32 |
+
Check if the given n-gram has an exact match in all sentences
|
33 |
+
|
34 |
+
Args:
|
35 |
+
ngram (str): The n-gram to search for
|
36 |
+
sentences (list): List of sentences to search in
|
37 |
+
|
38 |
+
Returns:
|
39 |
+
bool: True if n-gram has exact match in all sentences, False otherwise
|
40 |
+
"""
|
41 |
+
return all(ngram in sentence for sentence in sentences)
|
42 |
+
|
43 |
+
def is_substring_of_any(self, ngram, common_ngrams):
|
44 |
+
"""
|
45 |
+
Check if the given n-gram is an exact substring of any previously found common n-grams
|
46 |
+
|
47 |
+
Args:
|
48 |
+
ngram (str): The n-gram to check
|
49 |
+
common_ngrams (list): List of previously found common n-grams
|
50 |
+
|
51 |
+
Returns:
|
52 |
+
bool: True if ngram is a substring of any common_ngrams, False otherwise
|
53 |
+
"""
|
54 |
+
return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)
|
55 |
+
|
56 |
+
def find_filtered_ngrams(self, sentences):
|
57 |
+
"""
|
58 |
+
Find all n-grams that have exact matches across all sentences,
|
59 |
+
excluding those that are part of larger common n-grams
|
60 |
+
|
61 |
+
Args:
|
62 |
+
sentences (list): List of sentences to analyze
|
63 |
+
|
64 |
+
Returns:
|
65 |
+
list: List of tuples where each tuple contains the n-gram and its indices in each sentence
|
66 |
+
"""
|
67 |
+
original_sentences = sentences[:]
|
68 |
+
sentences = [self.remove_stopwords(sentence) for sentence in sentences]
|
69 |
+
ngram_lengths = [4, 3, 2, 1] # Quadgram, trigram, bigram, unigram
|
70 |
+
common_ngrams = []
|
71 |
+
|
72 |
+
for n in ngram_lengths:
|
73 |
+
ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences]
|
74 |
+
ngrams_counter = Counter(ngrams_list[0])
|
75 |
+
|
76 |
+
for ngram in ngrams_counter:
|
77 |
+
ngram_str = ' '.join(ngram)
|
78 |
+
if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, [ng[0] for ng in common_ngrams]):
|
79 |
+
indices = []
|
80 |
+
for original_sentence in original_sentences:
|
81 |
+
words = original_sentence.split()
|
82 |
+
ngram_indices = [
|
83 |
+
(i, i + n - 1) for i in range(len(words) - n + 1)
|
84 |
+
if ' '.join(words[i:i + n]).lower() == ngram_str
|
85 |
+
]
|
86 |
+
indices.append(ngram_indices)
|
87 |
+
common_ngrams.append((ngram_str, indices))
|
88 |
+
|
89 |
+
return common_ngrams
|
90 |
+
|
91 |
+
def find_relative_order(self, sentence, common_ngrams):
|
92 |
+
"""
|
93 |
+
Find the relative order of the common n-grams in the sentence
|
94 |
+
|
95 |
+
Args:
|
96 |
+
sentence (str): Sentence in which to find the relative order
|
97 |
+
common_ngrams (list): List of common n-grams
|
98 |
+
|
99 |
+
Returns:
|
100 |
+
list: List of tuples with the relative position and the n-gram
|
101 |
+
"""
|
102 |
+
relative_order = []
|
103 |
+
for ngram, _ in common_ngrams:
|
104 |
+
index = sentence.find(ngram)
|
105 |
+
if index != -1:
|
106 |
+
relative_order.append((index, ngram))
|
107 |
+
|
108 |
+
return sorted(relative_order)
|
109 |
+
|
110 |
+
# Example usage
|
111 |
+
if __name__ == "__main__":
|
112 |
+
sentences = [
|
113 |
+
"The quick brown fox jumps over the lazy dog.",
|
114 |
+
"A quick brown dog outpaces a lazy fox.",
|
115 |
+
"Quick brown animals leap over lazy obstacles."
|
116 |
+
]
|
117 |
+
|
118 |
+
processor = NgramProcessor()
|
119 |
+
common_ngrams = processor.find_filtered_ngrams(sentences)
|
120 |
+
print("Common n-grams and their indices:")
|
121 |
+
for ngram, indices in common_ngrams:
|
122 |
+
print(f"{ngram}: {indices}")
|
123 |
+
|
124 |
+
for sentence in sentences:
|
125 |
+
relative_order = processor.find_relative_order(sentence, common_ngrams)
|
126 |
+
print(f"Relative order in sentence '{sentence}':", relative_order)
|
127 |
+
|
128 |
+
|
129 |
+
|
# import nltk
# from nltk.corpus import stopwords
# from nltk.util import ngrams
# from collections import Counter
# import re

# class NgramProcessor:
#     def __init__(self):
#         try:
#             nltk.data.find('corpora/stopwords')
#         except LookupError:
#             nltk.download('stopwords')

#         self.stop_words = set(stopwords.words('english'))

#     def remove_stopwords(self, text):
#         """
#         Remove stopwords using NLTK's stopword list

#         Args:
#             text (str): Input text

#         Returns:
#             str: Cleaned text with stopwords removed
#         """
#         words = re.findall(r'\w+', text.lower())
#         filtered_words = [word for word in words if word not in self.stop_words]
#         return ' '.join(filtered_words)

#     def is_exact_match(self, ngram, sentences):
#         """
#         Check if the given n-gram has an exact match in all sentences

#         Args:
#             ngram (str): The n-gram to search for
#             sentences (list): List of sentences to search in

#         Returns:
#             bool: True if n-gram has exact match in all sentences, False otherwise
#         """
#         return all(ngram in sentence for sentence in sentences)

#     def is_substring_of_any(self, ngram, common_ngrams):
#         """
#         Check if the given n-gram is an exact substring of any previously found common n-grams

#         Args:
#             ngram (str): The n-gram to check
#             common_ngrams (list): List of previously found common n-grams

#         Returns:
#             bool: True if ngram is a substring of any common_ngrams, False otherwise
#         """
#         return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)

#     def find_filtered_ngrams(self, sentences):
#         """
#         Find all n-grams that have exact matches across all sentences,
#         excluding those that are part of larger common n-grams

#         Args:
#             sentences (list): List of sentences to analyze

#         Returns:
#             list: List of all common n-grams in order of their appearance in the first sentence
#         """
#         sentences = [self.remove_stopwords(sentence) for sentence in sentences]
#         ngram_lengths = [4, 3, 2, 1]  # Quadgram, trigram, bigram, unigram
#         common_ngrams = []

#         for n in ngram_lengths:
#             ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences]
#             ngrams_counter = Counter(ngrams_list[0])

#             for ngram in ngrams_counter:
#                 ngram_str = ' '.join(ngram)
#                 if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, common_ngrams):
#                     common_ngrams.append(ngram_str)

#         return common_ngrams

#     def find_relative_order(self, sentence, common_ngrams):
#         """
#         Find the relative order of the common n-grams in the sentence

#         Args:
#             sentence (str): Sentence in which to find the relative order
#             common_ngrams (list): List of common n-grams

#         Returns:
#             list: List of tuples with the relative position and the n-gram
#         """
#         relative_order = []
#         for ngram in common_ngrams:
#             index = sentence.find(ngram)
#             if index != -1:
#                 relative_order.append((index, ngram))

#         return sorted(relative_order)

# # Example usage
# if __name__ == "__main__":
#     sentences = [
#         "The quick brown fox jumps over the lazy dog.",
#         "A quick brown dog outpaces a lazy fox.",
#         "Quick brown animals leap over lazy obstacles."
#     ]

#     processor = NgramProcessor()
#     common_ngrams = processor.find_filtered_ngrams(sentences)
#     print("Common n-grams:", common_ngrams)

#     for sentence in sentences:
#         relative_order = processor.find_relative_order(sentence, common_ngrams)
#         print(f"Relative order in sentence '{sentence}':", relative_order)
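Editor's note: the commented-out NgramProcessor above implements the core idea of finding "non-melting" n-grams, i.e. word sequences that survive verbatim in every paraphrase. Below is a minimal standalone sketch of that idea for reference; the function and variable names are illustrative only and are not part of the pipeline.

from collections import Counter
from nltk.util import ngrams

def common_ngrams(sentences, max_n=4):
    """Return n-grams (as strings) present verbatim in every sentence, longest first."""
    found = []
    for n in range(max_n, 0, -1):
        # candidate n-grams come from the first sentence only
        candidates = Counter(ngrams(sentences[0].lower().split(), n))
        for gram in candidates:
            text = " ".join(gram)
            in_all = all(text in s.lower() for s in sentences)
            covered = any(text in longer for longer in found)  # skip pieces of longer matches
            if in_all and not covered:
                found.append(text)
    return found

# Example: shared n-grams across three paraphrases
print(common_ngrams([
    "The quick brown fox jumps over the lazy dog.",
    "A quick brown dog outpaces a lazy fox.",
    "Quick brown animals leap over lazy obstacles.",
]))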
utils/old/sampling/sampling.py
ADDED
@@ -0,0 +1,330 @@
import torch
import random
from masking_methods import MaskingProcessor
import nltk
from nltk.corpus import words
import torch.nn.functional as F


class SamplingProcessor:
    def __init__(self, tokenizer):
        """
        Initialize the SamplingProcessor.

        Args:
            tokenizer: BERT tokenizer instance
        """
        self.tokenizer = tokenizer
        self.subtoken_prefix = self._get_subtoken_prefix()
        self.subtoken_ids = self._get_subtoken_ids()
        try:
            nltk.data.find('corpora/words')
        except LookupError:
            nltk.download('words')
        self.english_words = set(words.words())

    # def _get_subtoken_prefix(self):
    #     """
    #     Identify the subtoken prefix based on the tokenizer.
    #
    #     Returns:
    #         str: The prefix used for subtokens (e.g., "##" for BERT).
    #     """
    #     # This method assumes that the tokenizer uses a consistent subtoken prefix.
    #     # Adjust accordingly if using different tokenizers.
    #     # For BERT's WordPiece tokenizer:
    #     if hasattr(self.tokenizer, "init_kwargs") and "wordpiece_prefix" in self.tokenizer.init_kwargs:
    #         return self.tokenizer.init_kwargs["wordpiece_prefix"]
    #     elif hasattr(self.tokenizer, "prefix_tokens"):
    #         return self.tokenizer.prefix_tokens
    #     else:
    #         # Default to BERT's subtoken prefix
    #         return "##"

    def _get_subtoken_prefix(self):
        """
        Identify the subtoken prefix based on the tokenizer.

        Returns:
            str: The prefix used for subtokens (e.g., "##" for BERT).
        """
        # This method assumes that the tokenizer uses a consistent subtoken prefix.
        # Adjust accordingly if using different tokenizers.
        # For BERT's WordPiece tokenizer:
        if hasattr(self.tokenizer, "init_kwargs") and "wordpiece_prefix" in self.tokenizer.init_kwargs:
            return self.tokenizer.init_kwargs["wordpiece_prefix"]
        elif hasattr(self.tokenizer, "prefix_tokens"):
            return self.tokenizer.prefix_tokens
        else:
            # Default to BERT's subtoken prefix
            return "##"

    # def _get_subtoken_ids(self):
    #     """
    #     Retrieve all token IDs that correspond to subtokens.
    #
    #     Returns:
    #         set: A set of subtoken IDs.
    #     """
    #     vocab = self.tokenizer.get_vocab()
    #     subtoken_ids = set()
    #     for token, idx in vocab.items():
    #         if token.startswith(self.subtoken_prefix):
    #             subtoken_ids.add(idx)
    #     return subtoken_ids

    def _get_subtoken_ids(self):
        """
        Retrieve all token IDs that correspond to subtokens.

        Returns:
            list: A list of subtoken IDs.
        """
        vocab = self.tokenizer.get_vocab()
        subtoken_ids = []
        for token, idx in vocab.items():
            if token.startswith(self.subtoken_prefix):
                subtoken_ids.append(idx)
        return subtoken_ids  # Changed from set to list

    def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0):
        tokens = self.tokenizer.tokenize(masked_sentence)

        for mask_pos in sorted(mask_logits_dict.keys()):
            try:
                # Get logits and squeeze extra dimension
                mask_logits = torch.tensor(mask_logits_dict[mask_pos]).squeeze(0)  # Remove the extra dimension

                # Create a mask for valid tokens (no special tokens, no subwords)
                valid_mask = torch.zeros_like(mask_logits, dtype=torch.bool)
                for idx in range(len(mask_logits)):
                    token = self.tokenizer.convert_ids_to_tokens([idx])[0]
                    # Only allow regular words (no special tokens, no subwords)
                    if token.isalpha() and not token.startswith('[') and not token.startswith('##'):
                        valid_mask[idx] = True

                # Get valid logits
                valid_logits = mask_logits[valid_mask]
                valid_indices = torch.where(valid_mask)[0]

                if len(valid_logits) == 0:
                    print(f"Warning: No valid tokens found for position {mask_pos}")
                    continue

                if sampling_technique == "inverse_transform":
                    probs = torch.softmax(valid_logits / temperature, dim=-1)
                    cumulative_probs = torch.cumsum(probs, dim=-1)
                    random_prob = random.random()
                    sampled_idx = torch.where(cumulative_probs >= random_prob)[0][0].item()
                    sampled_index = valid_indices[sampled_idx].item()

                elif sampling_technique == "exponential_minimum":
                    probs = torch.softmax(valid_logits / temperature, dim=-1)
                    exp_probs = torch.exp(-torch.log(probs))
                    random_probs = torch.rand_like(exp_probs)
                    sampled_idx = torch.argmax(random_probs * exp_probs).item()
                    sampled_index = valid_indices[sampled_idx].item()

                elif sampling_technique == "temperature":
                    valid_logits = torch.clamp(valid_logits, min=-1e8, max=1e8)
                    probs = torch.softmax(valid_logits / temperature, dim=-1)
                    if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                        raise ValueError("The computed probabilities contain NaN or inf values.")
                    probs = torch.max(probs, torch.tensor(1e-8))
                    probs = probs / torch.sum(probs)
                    sampled_idx = torch.multinomial(probs, 1)[0].item()
                    sampled_index = valid_indices[sampled_idx].item()

                elif sampling_technique == 'greedy':
                    sampled_idx = torch.argmax(valid_logits).item()
                    sampled_index = valid_indices[sampled_idx].item()

                # Replace mask with sampled token
                sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
                tokens[mask_pos] = sampled_token

            except Exception as e:
                print(f"Error sampling for position {mask_pos}: {str(e)}")
                continue

        return self.tokenizer.convert_tokens_to_string(tokens)

    def process_masked_sentences(self, results_dict, sampling_technique="temperature", temperature=1.0):
        """
        Process all masked sentences in the results dictionary.

        Args:
            results_dict (dict): Dictionary containing masked sentences and their logits
            sampling_technique (str): Sampling method to use
            temperature (float): Temperature parameter for sampling

        Returns:
            dict: Dictionary containing original, masked, and sampled sentences
        """
        processed_results = {}

        for original_sentence, data in results_dict.items():
            masked_sentence = data["masked_sentence"]
            mask_logits = data["mask_logits"]

            sampled_sentence = self.sample_tokens(
                mask_logits,
                masked_sentence,
                sampling_technique,
                temperature
            )

            processed_results[original_sentence] = {
                "masked_sentence": masked_sentence,
                "sampled_sentence": sampled_sentence
            }

        return processed_results

if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog everyday.",
    ]
    result_dict = {
        'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
    }

    # First, mask the sentences
    masking_processor = MaskingProcessor()
    masking_results = masking_processor.process_sentences(sentences, result_dict)

    # Then, sample replacements for the masks
    sampling_processor = SamplingProcessor(masking_processor.tokenizer)

    # Try different sampling techniques
    sampling_techniques = ["temperature", "greedy", "inverse_transform", "exponential_minimum"]

    for technique in sampling_techniques:
        print(f"\nSampling using {technique}:")
        sampled_results = sampling_processor.process_masked_sentences(
            masking_results,
            sampling_technique=technique,
            temperature=1.0
        )

        for original_sentence, result in sampled_results.items():
            print(f"Original: {original_sentence}")
            print(f"Masked: {result['masked_sentence']}")
            print(f"Sampled: {result['sampled_sentence']}")
            print("---")

# --------------------------------------------------------------------------------------------------
# def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0, top_k=100):
#     words = masked_sentence.split()
#     mask_positions = sorted(mask_logits_dict.keys())

#     for mask_pos in mask_positions:
#         mask_logits = torch.tensor(mask_logits_dict[mask_pos])

#         try:
#             if sampling_technique == "inverse_transform":
#                 probs = torch.softmax(mask_logits / temperature, dim=-1)
#                 cumulative_probs = torch.cumsum(probs, dim=-1)
#                 random_prob = random.random()
#                 sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

#             elif sampling_technique == "exponential_minimum":
#                 probs = torch.softmax(mask_logits / temperature, dim=-1)
#                 exp_probs = torch.exp(-torch.log(probs))
#                 random_probs = torch.rand_like(exp_probs)
#                 sampled_index = torch.argmax(random_probs * exp_probs).item()

#             elif sampling_technique == "temperature":
#                 mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8)
#                 probs = torch.softmax(mask_logits / temperature, dim=-1)
#                 if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
#                     raise ValueError("The computed probabilities contain NaN or inf values.")
#                 probs = torch.max(probs, torch.tensor(1e-8))
#                 probs = probs / torch.sum(probs)
#                 sampled_index = torch.multinomial(probs, 1)[0].item()

#             elif sampling_technique == 'greedy':
#                 sampled_index = torch.argmax(mask_logits).item()

#             else:
#                 raise ValueError(f"Unknown sampling technique: {sampling_technique}")

#             # Replace mask with sampled token
#             sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
#             words[mask_pos] = sampled_token

#         except Exception as e:
#             print(f"Error sampling for position {mask_pos}: {str(e)}")
#             continue

#     return " ".join(words)

## MORE WEIRD RESULTS
# def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0, top_k=100):
#     words = masked_sentence.split()
#     mask_positions = sorted(mask_logits_dict.keys())

#     for mask_pos in mask_positions:
#         mask_logits = torch.tensor(mask_logits_dict[mask_pos])

#         try:
#             # Create a mask for valid tokens (no special tokens, no subwords)
#             valid_mask = torch.zeros_like(mask_logits, dtype=torch.bool)
#             for idx in range(len(mask_logits)):
#                 token = self.tokenizer.convert_ids_to_tokens([idx])[0]
#                 # Only allow regular words (no special tokens, no subwords)
#                 if token.isalpha() and not token.startswith('[') and not token.startswith('##'):
#                     valid_mask[idx] = True

#             # Get valid logits
#             valid_logits = mask_logits[valid_mask]
#             valid_indices = torch.where(valid_mask)[0]

#             if len(valid_logits) == 0:
#                 print(f"Warning: No valid tokens found for position {mask_pos}")
#                 continue

#             if sampling_technique == "inverse_transform":
#                 probs = torch.softmax(valid_logits / temperature, dim=-1)
#                 cumulative_probs = torch.cumsum(probs, dim=-1)
#                 random_prob = random.random()
#                 sampled_idx = torch.where(cumulative_probs >= random_prob)[0][0].item()
#                 sampled_index = valid_indices[sampled_idx].item()

#             elif sampling_technique == "exponential_minimum":
#                 probs = torch.softmax(valid_logits / temperature, dim=-1)
#                 exp_probs = torch.exp(-torch.log(probs))
#                 random_probs = torch.rand_like(exp_probs)
#                 sampled_idx = torch.argmax(random_probs * exp_probs).item()
#                 sampled_index = valid_indices[sampled_idx].item()

#             elif sampling_technique == "temperature":
#                 valid_logits = torch.clamp(valid_logits, min=-1e8, max=1e8)
#                 probs = torch.softmax(valid_logits / temperature, dim=-1)
#                 if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
#                     raise ValueError("The computed probabilities contain NaN or inf values.")
#                 probs = torch.max(probs, torch.tensor(1e-8))
#                 probs = probs / torch.sum(probs)
#                 sampled_idx = torch.multinomial(probs, 1)[0].item()
#                 sampled_index = valid_indices[sampled_idx].item()

#             elif sampling_technique == 'greedy':
#                 sampled_idx = torch.argmax(valid_logits).item()
#                 sampled_index = valid_indices[sampled_idx].item()

#             else:
#                 raise ValueError(f"Unknown sampling technique: {sampling_technique}")

#             # Replace mask with sampled token
#             sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
#             words[mask_pos] = sampled_token

#         except Exception as e:
#             print(f"Error sampling for position {mask_pos}: {str(e)}")
#             continue

#     return " ".join(words)
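Editor's note: the same four sampling branches recur in every file in this folder. As a compact reference, here is a hedged, self-contained sketch of what each branch reduces to when applied to a single logits vector. Names are illustrative and not part of the pipeline; the "exponential_minimum" rule mirrors the rand/prob trick used above rather than a textbook method.

import torch

def pick_index(logits, technique="temperature", temperature=1.0):
    """Select one index from a 1-D logits tensor with the given strategy."""
    probs = torch.softmax(logits / temperature, dim=-1)
    if technique == "greedy":
        return torch.argmax(logits).item()
    if technique == "temperature":
        return torch.multinomial(probs, 1).item()
    if technique == "inverse_transform":
        # invert the CDF at a uniform random point
        cdf = torch.cumsum(probs, dim=-1)
        return torch.searchsorted(cdf, torch.rand(1)).clamp(max=logits.numel() - 1).item()
    if technique == "exponential_minimum":
        # same rule the classes above use: argmax of uniform noise weighted by 1/p
        return torch.argmax(torch.rand_like(probs) / probs).item()
    raise ValueError(f"Unknown technique: {technique}")

logits = torch.randn(10)
for technique in ["greedy", "temperature", "inverse_transform", "exponential_minimum"]:
    print(technique, pick_index(logits, technique))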
utils/old/sampling/sampling_methods.py
ADDED
@@ -0,0 +1,291 @@
from transformers import BertTokenizer, BertForMaskedLM
import torch
import random
from masking_methods import MaskingProcessor
from transformers import pipeline

class SamplingProcessorWithModel:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForMaskedLM.from_pretrained(model_name)
        self.model.eval()  # Set the model to evaluation mode

    def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0):
        """
        Fills each mask in the masked sentence using the specified sampling technique.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens.
            sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
            temperature (float): Temperature parameter for sampling methods.

        Returns:
            str: Sentence with the masks filled.
        """
        input_ids = self.tokenizer.encode(masked_sentence, return_tensors="pt")

        while self.tokenizer.mask_token_id in input_ids[0]:
            # Find indices of all [MASK] tokens
            mask_indices = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

            # Process the first [MASK] token in the sequence
            mask_index = mask_indices[0].item()

            # Get logits from the model
            with torch.no_grad():
                outputs = self.model(input_ids)
                logits = outputs.logits

            # Extract logits for the [MASK] token
            mask_logits = logits[0, mask_index]

            if sampling_technique == "inverse_transform":
                probs = torch.softmax(mask_logits / temperature, dim=-1)
                cumulative_probs = torch.cumsum(probs, dim=-1)
                random_prob = random.random()
                sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

            elif sampling_technique == "exponential_minimum":
                probs = torch.softmax(mask_logits / temperature, dim=-1)
                exp_probs = torch.exp(-torch.log(probs))
                random_probs = torch.rand_like(exp_probs)
                sampled_index = torch.argmax(random_probs * exp_probs).item()

            elif sampling_technique == "temperature":
                mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8)
                probs = torch.softmax(mask_logits / temperature, dim=-1)
                if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                    raise ValueError("The computed probabilities contain NaN or inf values.")
                probs = torch.max(probs, torch.tensor(1e-8, device=mask_logits.device))
                probs = probs / torch.sum(probs)
                probs = probs.flatten()
                if probs.size(0) > 1:
                    sampled_index = torch.multinomial(probs, 1).item()
                else:
                    sampled_index = torch.argmax(probs).item()

            elif sampling_technique == 'greedy':
                sampled_index = torch.argmax(mask_logits).item()

            else:
                raise ValueError(f"Unknown sampling technique: {sampling_technique}")

            # Replace the first [MASK] with the selected token
            input_ids[0, mask_index] = sampled_index

        return self.tokenizer.decode(input_ids[0], skip_special_tokens=True)

    def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0):
        """
        Fills each mask in the masked sentence using the specified sampling technique.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens.
            sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
            temperature (float): Temperature parameter for sampling methods.

        Returns:
            str: Sentence with the masks filled.
        """
        while '[MASK]' in masked_sentence:
            # Get predictions for the first [MASK]
            predictions = self.unmasker(masked_sentence)

            # Ensure predictions is a list of dictionaries
            if not isinstance(predictions, list) or not all(isinstance(pred, dict) for pred in predictions):
                raise ValueError("Unexpected structure in predictions from the pipeline.")

            # Extract logits (scores) from the predictions
            logits = torch.tensor([pred['score'] for pred in predictions], dtype=torch.float32)

            if sampling_technique == "inverse_transform":
                probs = torch.softmax(logits / temperature, dim=-1)
                cumulative_probs = torch.cumsum(probs, dim=-1)
                random_prob = random.random()
                sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

            elif sampling_technique == "exponential_minimum":
                probs = torch.softmax(logits / temperature, dim=-1)
                exp_probs = torch.exp(-torch.log(probs))
                random_probs = torch.rand_like(exp_probs)
                sampled_index = torch.argmax(random_probs * exp_probs).item()

            elif sampling_technique == "temperature":
                logits = torch.clamp(logits, min=-1e8, max=1e8)
                probs = torch.softmax(logits / temperature, dim=-1)
                if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                    raise ValueError("The computed probabilities contain NaN or inf values.")
                probs = torch.max(probs, torch.tensor(1e-8, device=logits.device))
                probs = probs / torch.sum(probs)
                probs = probs.flatten()
                if probs.size(0) > 1:
                    sampled_index = torch.multinomial(probs, 1).item()
                else:
                    sampled_index = torch.argmax(probs).item()

            elif sampling_technique == 'greedy':
                sampled_index = torch.argmax(logits).item()

            else:
                raise ValueError(f"Unknown sampling technique: {sampling_technique}")

            # Replace the first [MASK] with the selected word
            sampled_token = predictions[sampled_index]['token_str']
            masked_sentence = masked_sentence.replace('[MASK]', sampled_token, 1)

        return masked_sentence


# Example usage
if __name__ == "__main__":
    from transformers import BertTokenizer

    # Define sentences and result_dict
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown dog leaps over lazy the fox."
    ]
    result_dict = {
        "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]},
        "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]},
        "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}
    }

    masking_processor = MaskingProcessor()
    masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)

    # Use SamplingProcessor
    sampling_processor = SamplingProcessorWithModel()

    # Iterate through masking results to apply sampling
    for sentence, result in masking_results.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {result['masked_sentence']}")
        masked_sentence = result["masked_sentence"]

        # Apply different sampling techniques
        for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
            print(f"Sampling Technique: {technique}")
            filled_sentence = sampling_processor.fill_masked_sentence(
                masked_sentence=masked_sentence,
                sampling_technique=technique,
                temperature=1.0  # Adjust temperature as needed
            )
            print(f"Filled Sentence: {filled_sentence}\n")
        print('--------------------------------')


# from transformers import pipeline
# import torch
# import random
# from masking_methods import MaskingProcessor


# class SamplingProcessorWithPipeline:
#     def __init__(self, model_name='bert-base-uncased'):
#         self.unmasker = pipeline('fill-mask', model=model_name)
#         self.tokenizer = self.unmasker.tokenizer

#     def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0):
#         """
#         Fills each mask in the masked sentence using the specified sampling technique.

#         Args:
#             masked_sentence (str): Sentence with [MASK] tokens.
#             sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
#             temperature (float): Temperature parameter for sampling methods.

#         Returns:
#             str: Sentence with the masks filled.
#         """
#         while '[MASK]' in masked_sentence:
#             # Get predictions for the first [MASK]
#             predictions = self.unmasker(masked_sentence)
#             print(f' predictions : {predictions}')
#             print(f' type of predictions : {type(predictions)}')

#             # Ensure predictions is a list of dictionaries for the first [MASK]
#             if not isinstance(predictions, list) or not all(isinstance(pred, dict) for pred in predictions):
#                 raise ValueError("Unexpected structure in predictions from the pipeline.")

#             # Extract logits (scores) from the predictions
#             logits = torch.tensor([pred['score'] for pred in predictions], dtype=torch.float32)

#             if sampling_technique == "inverse_transform":
#                 probs = torch.softmax(logits / temperature, dim=-1)
#                 cumulative_probs = torch.cumsum(probs, dim=-1)
#                 random_prob = random.random()
#                 sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

#             elif sampling_technique == "exponential_minimum":
#                 probs = torch.softmax(logits / temperature, dim=-1)
#                 exp_probs = torch.exp(-torch.log(probs))
#                 random_probs = torch.rand_like(exp_probs)
#                 sampled_index = torch.argmax(random_probs * exp_probs).item()

#             elif sampling_technique == "temperature":
#                 logits = torch.clamp(logits, min=-1e8, max=1e8)
#                 probs = torch.softmax(logits / temperature, dim=-1)
#                 if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
#                     raise ValueError("The computed probabilities contain NaN or inf values.")
#                 probs = torch.max(probs, torch.tensor(1e-8, device=logits.device))
#                 probs = probs / torch.sum(probs)
#                 probs = probs.flatten()
#                 if probs.size(0) > 1:
#                     sampled_index = torch.multinomial(probs, 1).item()
#                 else:
#                     sampled_index = torch.argmax(probs).item()

#             elif sampling_technique == 'greedy':
#                 sampled_index = torch.argmax(logits).item()

#             else:
#                 raise ValueError(f"Unknown sampling technique: {sampling_technique}")

#             # Replace the first [MASK] with the selected word
#             sampled_token = predictions[sampled_index]['token_str']
#             masked_sentence = masked_sentence.replace('[MASK]', sampled_token, 1)

#         return masked_sentence


# # Example usage
# if __name__ == "__main__":
#     from transformers import BertTokenizer

#     # Define sentences and result_dict
#     sentences = [
#         "The quick brown fox jumps over the lazy dog.",
#         "A quick brown dog outpaces a lazy fox.",
#         "Quick brown animals leap over lazy obstacles."
#     ]
#     result_dict = {
#         "The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]},
#         "A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]},
#         "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]}
#     }

#     masking_processor = MaskingProcessor()
#     masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)

#     # Use SamplingProcessor
#     sampling_processor = SamplingProcessorWithPipeline()

#     # Iterate through masking results to apply sampling
#     for sentence, result in masking_results.items():
#         print(f"Original Sentence (Random): {sentence}")
#         print(f"Masked Sentence (Random): {result['masked_sentence']}")
#         masked_sentence = result["masked_sentence"]

#         # Apply different sampling techniques
#         for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
#             print(f"Sampling Technique: {technique}")
#             filled_sentence = sampling_processor.fill_masked_sentence(
#                 masked_sentence=masked_sentence,
#                 sampling_technique=technique,
#                 temperature=1.0  # Adjust temperature as needed
#             )
#             print(f"Filled Sentence: {filled_sentence}\n")
#             print('--------------------------------')
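Editor's note: SamplingProcessorWithModel consumes per-mask logits taken directly from a masked language model. A minimal sketch of where those logits come from, reduced to a single greedy top-5 lookup rather than the full iterative loop (standard transformers API, no pipeline-specific assumptions):

from transformers import BertTokenizer, BertForMaskedLM
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
model.eval()

text = "The quick brown [MASK] jumps over the lazy dog."
input_ids = tokenizer.encode(text, return_tensors="pt")
mask_index = torch.where(input_ids == tokenizer.mask_token_id)[1][0].item()

with torch.no_grad():
    logits = model(input_ids).logits[0, mask_index]  # vocabulary-sized logits for the [MASK] slot

top5 = torch.topk(logits, 5).indices.tolist()
print(tokenizer.convert_ids_to_tokens(top5))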
utils/old/sampling/sampling_methods_v1.py
ADDED
@@ -0,0 +1,146 @@
import torch
import random
from masking_methods import MaskingProcessor

class SamplingProcessor:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def fill_masked_sentence(self, original_sentence, mask_logits, sampling_technique, temperature=1.0):
        """
        Fills each mask in the masked sentence using the specified sampling technique.

        Args:
            original_sentence (str): The original masked sentence.
            mask_logits (dict): Logits for each [MASK] token.
            sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
            temperature (float): Temperature parameter for sampling methods.

        Returns:
            str: Sentence with the masks filled.
        """
        sentence_tokens = self.tokenizer.tokenize(original_sentence)
        mask_token_indices = [i for i, token in enumerate(sentence_tokens) if token == self.tokenizer.mask_token]

        if len(mask_token_indices) != len(mask_logits):
            raise ValueError("Mismatch between number of [MASK] tokens and logits provided.")

        for mask_idx, filtered_logits in zip(mask_token_indices, mask_logits.values()):
            # Convert logits to a tensor
            filtered_logits = torch.tensor(filtered_logits)
            # filtered_logits, _ = torch.sort(filtered_logits, descending=True)
            # print(f' type of filtered_logits : {type(filtered_logits)}')
            # filtered_logits = filtered_logits[:5]

            if sampling_technique == "inverse_transform":
                probs = torch.softmax(filtered_logits / temperature, dim=-1)
                cumulative_probs = torch.cumsum(probs, dim=-1)
                random_prob = random.random()
                sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

            elif sampling_technique == "exponential_minimum":
                probs = torch.softmax(filtered_logits / temperature, dim=-1)
                exp_probs = torch.exp(-torch.log(probs))
                random_probs = torch.rand_like(exp_probs)
                sampled_index = torch.argmax(random_probs * exp_probs).item()

            elif sampling_technique == "temperature":
                filtered_logits = torch.clamp(filtered_logits, min=-1e8, max=1e8)
                probs = torch.softmax(filtered_logits / temperature, dim=-1)
                if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                    raise ValueError("The computed probabilities contain NaN or inf values.")
                probs = torch.max(probs, torch.tensor(1e-8, device=filtered_logits.device))
                probs = probs / torch.sum(probs)
                probs = probs.flatten()
                if probs.size(0) > 1:
                    sampled_index = torch.multinomial(probs, 1).item()
                else:
                    sampled_index = torch.argmax(probs).item()

            elif sampling_technique == 'greedy':
                sampled_index = torch.argmax(filtered_logits).item()

            else:
                raise ValueError(f"Unknown sampling technique: {sampling_technique}")

            sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
            sentence_tokens[mask_idx] = sampled_token

        return self.tokenizer.convert_tokens_to_string(sentence_tokens)

    def process_samples(self, masked_sentences, mask_logits, sampling_technique, temperature=1.0):
        """
        Process multiple masked sentences and fill their masks using the specified sampling technique.

        Args:
            masked_sentences (list): List of masked sentences.
            mask_logits (dict): Logits for each [MASK] token in each sentence.
            sampling_technique (str): Sampling technique to use.
            temperature (float): Temperature parameter for sampling methods.

        Returns:
            list: List of sentences with masks filled.
        """
        filled_sentences = []
        for sentence, logits in zip(masked_sentences, mask_logits):
            filled_sentence = self.fill_masked_sentence(sentence, logits, sampling_technique, temperature)
            filled_sentences.append(filled_sentence)
        return filled_sentences

# Example usage
if __name__ == "__main__":
    from transformers import BertTokenizer

    # tokenizer = BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    processor = SamplingProcessor(tokenizer)

    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown dog leaps over lazy the fox."
    ]
    result_dict = {
        "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]},
        "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]},
        "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}
    }

    masking_processor = MaskingProcessor()
    masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)
    # masked_sentence = "The [MASK] brown fox jumps [MASK] the lazy dog."
    # mask_logits = {
    #     1: torch.randn(len(tokenizer)),  # Example logits for first [MASK]
    #     5: torch.randn(len(tokenizer)),  # Example logits for second [MASK]
    # }

    # Iterate through masking results to apply sampling
    for sentence, result in masking_results.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {result['masked_sentence']}")
        # print(f"Mask Logits (Random): {output['mask_logits']}")
        print(f' type(result["mask_logits"]) : {type(result["mask_logits"])}')
        print(f' length of result["mask_logits"] : {len(result["mask_logits"])}')
        print(f' result["mask_logits"].keys() : {result["mask_logits"].keys()}')
        masked_sentence = result["masked_sentence"]
        mask_logits = result["mask_logits"]

        print(f"Original Masked Sentence: {masked_sentence}")

        # Apply different sampling techniques
        for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
            print(f"Sampling Technique: {technique}")

            # Fill the masks using the sampling processor
            filled_sentence = processor.fill_masked_sentence(
                original_sentence=masked_sentence,
                mask_logits=mask_logits,
                sampling_technique=technique,
                temperature=1.0  # Adjust temperature as needed
            )

            print(f"Filled Sentence: {filled_sentence}\n")
        print('--------------------------------')
utils/old/sampling/sampling_methods_v2.py
ADDED
@@ -0,0 +1,112 @@
from transformers import pipeline
import torch
import random
from masking_methods import MaskingProcessor


class SamplingProcessorWithPipeline:
    def __init__(self, model_name='bert-base-uncased'):
        self.unmasker = pipeline('fill-mask', model=model_name)
        self.tokenizer = self.unmasker.tokenizer

    def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0):
        """
        Fills each mask in the masked sentence using the specified sampling technique.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens.
            sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
            temperature (float): Temperature parameter for sampling methods.

        Returns:
            str: Sentence with the masks filled.
        """
        while '[MASK]' in masked_sentence:
            # Get predictions for the first [MASK]
            predictions = self.unmasker(masked_sentence)
            print(f' predictions : {predictions}')
            print(f' type of predictions : {type(predictions)}')

            # Ensure predictions is a list of dictionaries
            if not isinstance(predictions, list) or not all(isinstance(pred, dict) for pred in predictions):
                raise ValueError("Unexpected structure in predictions from the pipeline.")

            # Extract logits (scores) from the predictions
            logits = torch.tensor([pred['score'] for pred in predictions], dtype=torch.float32)

            if sampling_technique == "inverse_transform":
                probs = torch.softmax(logits / temperature, dim=-1)
                cumulative_probs = torch.cumsum(probs, dim=-1)
                random_prob = random.random()
                sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

            elif sampling_technique == "exponential_minimum":
                probs = torch.softmax(logits / temperature, dim=-1)
                exp_probs = torch.exp(-torch.log(probs))
                random_probs = torch.rand_like(exp_probs)
                sampled_index = torch.argmax(random_probs * exp_probs).item()

            elif sampling_technique == "temperature":
                logits = torch.clamp(logits, min=-1e8, max=1e8)
                probs = torch.softmax(logits / temperature, dim=-1)
                if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                    raise ValueError("The computed probabilities contain NaN or inf values.")
                probs = torch.max(probs, torch.tensor(1e-8, device=logits.device))
                probs = probs / torch.sum(probs)
                probs = probs.flatten()
                if probs.size(0) > 1:
                    sampled_index = torch.multinomial(probs, 1).item()
                else:
                    sampled_index = torch.argmax(probs).item()

            elif sampling_technique == 'greedy':
                sampled_index = torch.argmax(logits).item()

            else:
                raise ValueError(f"Unknown sampling technique: {sampling_technique}")

            # Replace the first [MASK] with the selected word
            sampled_token = predictions[sampled_index]['token_str']
            masked_sentence = masked_sentence.replace('[MASK]', sampled_token, 1)

        return masked_sentence


# Example usage
if __name__ == "__main__":
    from transformers import BertTokenizer

    # Define sentences and result_dict
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown dog leaps over lazy the fox."
    ]
    result_dict = {
        "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]},
        "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]},
        "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}
    }

    masking_processor = MaskingProcessor()
    masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)

    # Use SamplingProcessor
    sampling_processor = SamplingProcessorWithPipeline()

    # Iterate through masking results to apply sampling
    for sentence, result in masking_results.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {result['masked_sentence']}")
        masked_sentence = result["masked_sentence"]

        # Apply different sampling techniques
        for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
            print(f"Sampling Technique: {technique}")
            filled_sentence = sampling_processor.fill_masked_sentence(
                masked_sentence=masked_sentence,
                sampling_technique=technique,
                temperature=1.0  # Adjust temperature as needed
            )
            print(f"Filled Sentence: {filled_sentence}\n")
        print('--------------------------------')
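Editor's note: this pipeline-based variant treats the fill-mask pipeline's confidence scores as if they were logits. For reference, this is the shape of what the pipeline actually returns for a single [MASK]: a list of dicts whose 'score' values are already softmax probabilities over the top-k candidates, not raw vocabulary logits. A short sketch (standard transformers API):

from transformers import pipeline

unmasker = pipeline("fill-mask", model="bert-base-uncased")
preds = unmasker("The quick brown [MASK] jumps over the lazy dog.")
# each prediction is a dict like {'score': ..., 'token': ..., 'token_str': ..., 'sequence': ...}
for p in preds[:3]:
    print(round(p["score"], 3), p["token_str"])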
utils/old/sampling_final_copy.py
ADDED
@@ -0,0 +1,168 @@
import torch
import random
from masking_methods import MaskingProcessor

class SamplingProcessor:
    def __init__(self, tokenizer):
        """
        Initialize the SamplingProcessor.

        Args:
            tokenizer: BERT tokenizer instance
        """
        self.tokenizer = tokenizer

    def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0):
        """
        Sample tokens for each mask in the sentence using the specified sampling technique.

        Args:
            mask_logits_dict (dict): Dictionary of mask positions and their logits/tokens
            masked_sentence (str): Sentence with [MASK] tokens
            sampling_technique (str): Sampling method to use
            temperature (float): Temperature parameter for sampling

        Returns:
            str: Sentence with sampled tokens replacing masks
        """
        words = masked_sentence.split()

        # Convert positions and logits to sorted list to process masks in order
        mask_positions = sorted(mask_logits_dict.keys())

        for mask_pos in mask_positions:
            mask_data = mask_logits_dict[mask_pos]
            mask_logits = torch.tensor(mask_data['logits'])
            candidate_tokens = mask_data['tokens']

            try:
                if sampling_technique == "inverse_transform":
                    probs = torch.softmax(mask_logits / temperature, dim=-1)
                    cumulative_probs = torch.cumsum(probs, dim=-1)
                    random_prob = random.random()
                    sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

                elif sampling_technique == "exponential_minimum":
                    probs = torch.softmax(mask_logits / temperature, dim=-1)
                    exp_probs = torch.exp(-torch.log(probs))
                    random_probs = torch.rand_like(exp_probs)
                    sampled_index = torch.argmax(random_probs * exp_probs).item()

                elif sampling_technique == "temperature":
                    mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8)
                    probs = torch.softmax(mask_logits / temperature, dim=-1)
                    if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                        raise ValueError("The computed probabilities contain NaN or inf values.")
                    probs = torch.max(probs, torch.tensor(1e-8))
                    probs = probs / torch.sum(probs)
                    probs = probs.flatten()
                    if probs.size(0) > 1:
                        sampled_index = torch.multinomial(probs, 1).item()
                    else:
                        sampled_index = torch.argmax(probs).item()

                elif sampling_technique == 'greedy':
                    sampled_index = torch.argmax(mask_logits).item()

                else:
                    raise ValueError(f"Unknown sampling technique: {sampling_technique}")

                # Use the sampled index to get the corresponding token
                sampled_token = candidate_tokens[sampled_index]
                # Remove ## if it's a subword token
                sampled_token = sampled_token.replace('##', '')
                words[mask_pos] = sampled_token

            except Exception as e:
                print(f"Error sampling for position {mask_pos}: {str(e)}")
                continue

        return " ".join(words)

    def process_masked_sentences(self, results_dict, sampling_technique="temperature", temperature=1.0):
        """
        Process all masked sentences in the results dictionary.

        Args:
            results_dict (dict): Dictionary containing masked sentences and their logits
            sampling_technique (str): Sampling method to use
            temperature (float): Temperature parameter for sampling

        Returns:
            dict: Dictionary containing original, masked, and sampled sentences
        """
        processed_results = {}

        for original_sentence, data in results_dict.items():
            masked_sentence = data["masked_sentence"]
            mask_logits = data["mask_logits"]

            sampled_sentence = self.sample_tokens(
                mask_logits,
                masked_sentence,
                sampling_technique,
                temperature
            )

            processed_results[original_sentence] = {
                "masked_sentence": masked_sentence,
                "sampled_sentence": sampled_sentence
            }

        return processed_results


if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog everyday.",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog."
    ]
    result_dict = {
        'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
    }

    # First, mask the sentences
    masking_processor = MaskingProcessor()
    masking_results = masking_processor.process_sentences(sentences, result_dict)

    # Then, sample replacements for the masks
    sampling_processor = SamplingProcessor(masking_processor.tokenizer)

    # Try different sampling techniques
    sampling_techniques = ["temperature", "greedy", "inverse_transform", "exponential_minimum"]

    for technique in sampling_techniques:
        print(f"\nSampling using {technique}:")
        sampled_results = sampling_processor.process_masked_sentences(
            masking_results,
            sampling_technique=technique,
            temperature=1.0
        )

        '''
        {
            "original_sentence_1":
            {
                "masked_sentence": "sentence with [MASK] tokens",
                "sampling_method1": "sentence with sampled tokens",
            },
            "original_sentence_2":
            {
                "masked_sentence": "sentence with [MASK] tokens",
                "sampling_method": "sentence with sampled tokens"
            },
            # ... and so on for each input sentence
        },
        '''

        for original_sentence, result in sampled_results.items():
            print(f"Original: {original_sentence}")
            print(f"Masked: {result['masked_sentence']}")
            print(f"Sampled: {result['sampled_sentence']}")
            print("---")
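Editor's note: unlike utils/old/sampling/sampling.py above, this final copy expects each mask position to carry a pre-filtered candidate list plus the logits for just those candidates, and positions index into masked_sentence.split(). A hand-built illustration of that input layout (all tokens and numbers below are made up for the example, not produced by the pipeline):

mask_logits = {
    2: {"tokens": ["swift", "speedy", "sly"], "logits": [3.1, 2.7, 0.4]},
    8: {"tokens": ["dog", "hound", "wolf"], "logits": [4.0, 1.2, 0.8]},
}
masked_sentence = "The quick [MASK] fox jumps over the lazy [MASK] everyday."
# sampler = SamplingProcessor(tokenizer)
# print(sampler.sample_tokens(mask_logits, masked_sentence, sampling_technique="greedy"))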
utils/paraphraser.py
ADDED
@@ -0,0 +1,75 @@
1 |
+
"""
|
2 |
+
This file contains the code to generate paraphrases of sentences.
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
import logging
|
7 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
8 |
+
from tqdm import tqdm # for progress bars
|
9 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
10 |
+
|
11 |
+
from utils.config import load_config
|
12 |
+
# config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.yaml')
|
13 |
+
# config = load_config(config_path)['PECCAVI_TEXT']['Paraphrase']
|
14 |
+
|
15 |
+
# Configure logging to show only warnings or above on the terminal.
|
16 |
+
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
+
|
19 |
+
class Paraphraser:
|
20 |
+
"""
|
21 |
+
Paraphraser class to generate paraphrases of sentences.
|
22 |
+
"""
|
23 |
+
def __init__(self, config):
|
24 |
+
self.config = config
|
25 |
+
import torch
|
26 |
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
27 |
+
tqdm.write(f"[Paraphraser] Initializing on device: {self.device}")
|
28 |
+
self.tokenizer = AutoTokenizer.from_pretrained(config['tokenizer'])
|
29 |
+
self.model = AutoModelForSeq2SeqLM.from_pretrained(config['model']).to(self.device)
|
30 |
+
self.num_beams = config['num_beams']
|
31 |
+
self.num_beam_groups = config['num_beam_groups']
|
32 |
+
self.num_return_sequences = config['num_return_sequences']
|
33 |
+
self.repetition_penalty = config['repetition_penalty']
|
34 |
+
self.diversity_penalty = config['diversity_penalty']
|
35 |
+
self.no_repeat_ngram_size = config['no_repeat_ngram_size']
|
36 |
+
self.temperature = config['temperature']
|
37 |
+
self.max_length = config['max_length']
|
38 |
+
|
39 |
+
def paraphrase(self, sentence: str, num_return_sequences: int=None, num_beams: int=None, num_beam_groups: int=None):
|
40 |
+
tqdm.write(f"[Paraphraser] Starting paraphrase for sentence: {sentence}")
|
41 |
+
if num_return_sequences is None:
|
42 |
+
num_return_sequences = self.num_return_sequences
|
43 |
+
if num_beams is None:
|
44 |
+
num_beams = self.num_beams
|
45 |
+
if num_beam_groups is None:
|
46 |
+
num_beam_groups = self.num_beam_groups
|
47 |
+
|
48 |
+
inputs = self.tokenizer.encode("paraphrase: " + sentence,
|
49 |
+
return_tensors="pt",
|
50 |
+
max_length=self.max_length,
|
51 |
+
truncation=True).to(self.device)
|
52 |
+
outputs = self.model.generate(
|
53 |
+
inputs,
|
54 |
+
max_length=self.max_length,
|
55 |
+
num_beams=num_beams,
|
56 |
+
num_beam_groups=num_beam_groups,
|
57 |
+
num_return_sequences=num_return_sequences,
|
58 |
+
repetition_penalty=self.repetition_penalty,
|
59 |
+
diversity_penalty=self.diversity_penalty,
|
60 |
+
no_repeat_ngram_size=self.no_repeat_ngram_size,
|
61 |
+
temperature=self.temperature
|
62 |
+
)
|
63 |
+
paraphrases = [self.tokenizer.decode(output, skip_special_tokens=True)
|
64 |
+
for output in tqdm(outputs, desc="Decoding Paraphrases")]
|
65 |
+
tqdm.write(f"[Paraphraser] Paraphrase completed. {len(paraphrases)} outputs generated.")
|
66 |
+
return paraphrases
|
67 |
+
|
68 |
+
if __name__ == "__main__":
|
69 |
+
config_path = '/home/jigyasu/PECCAVI-Text/utils/config.yaml'
|
70 |
+
config = load_config(config_path)
|
71 |
+
paraphraser = Paraphraser(config['PECCAVI_TEXT']['Paraphrase'])
|
72 |
+
sentence = "The quick brown fox jumps over the lazy dog."
|
73 |
+
paraphrases = paraphraser.paraphrase(sentence)
|
74 |
+
for paraphrase in paraphrases:
|
75 |
+
print(paraphrase)
|
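Editor's note: the Paraphraser only reads the keys it copies in __init__, so any config stanza handed to it needs at least the fields below. The checkpoint name and the numeric values here are placeholders for illustration only; the actual values live in utils/config.yaml and are not reproduced here.

paraphrase_config = {
    "tokenizer": "humarin/chatgpt_paraphraser_on_T5_base",  # placeholder checkpoint
    "model": "humarin/chatgpt_paraphraser_on_T5_base",      # placeholder checkpoint
    "num_beams": 10,
    "num_beam_groups": 10,        # diverse beam search: num_beams must be divisible by this
    "num_return_sequences": 5,
    "repetition_penalty": 10.0,
    "diversity_penalty": 3.0,     # must be > 0 when num_beam_groups > 1
    "no_repeat_ngram_size": 2,
    "temperature": 0.7,
    "max_length": 128,
}

# paraphraser = Paraphraser(paraphrase_config)
# print(paraphraser.paraphrase("The quick brown fox jumps over the lazy dog."))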