Pringled committed on
Commit 95530b9 · 1 Parent(s): c58907b
Files changed (1)
  1. app.py +284 -1420
app.py CHANGED
@@ -5,79 +5,72 @@ from model2vec import StaticModel
  from reach import Reach
  from difflib import ndiff
 
- # Load the model at startup
  model = StaticModel.from_pretrained("minishlab/M2V_base_output")
 
- # Default dataset parameters
- default_dataset1_name = "sst2"
- default_dataset1_split = "train"
- default_dataset2_name = "sst2"
- default_dataset2_split = "validation"
  default_text_column = "sentence"
  default_threshold = 0.9
 
- # Load the default datasets at startup
- ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
- ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
-
  def batch_iterable(iterable, batch_size):
-     """Helper function to create batches from an iterable."""
      for i in range(0, len(iterable), batch_size):
          yield iterable[i:i + batch_size]
 
- def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
      embeddings = []
      total_batches = (len(texts) + batch_size - 1) // batch_size
      for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
-         batch_embeddings = model.encode(batch_texts, show_progressbar=False)
-         embeddings.append(batch_embeddings)
          progress((i + 1) / total_batches, desc=desc)
      return np.concatenate(embeddings, axis=0)
 
- def deduplicate(
-     embedding_matrix: np.ndarray,
-     threshold: float,
      batch_size: int = 1024,
      progress=None
- ) -> tuple[np.ndarray, dict[int, int]]:
-     # Building the index
-     progress(0, desc="Building search index...")
-     reach = Reach(
-         vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))]
-     )
-
-     deduplicated_indices = set(range(len(embedding_matrix)))
-     duplicate_to_original_mapping = {}
-
-     # Finding nearest neighbors
-     progress(0, desc="Finding nearest neighbors...")
-     results = reach.nearest_neighbor_threshold(
-         embedding_matrix,
-         threshold=threshold,
-         batch_size=batch_size,
-         show_progressbar=False,  # Disable internal progress bar
-     )
-
-     # Processing duplicates with a progress bar
-     total_items = len(embedding_matrix)
-     for i, similar_items in enumerate(
-         progress.tqdm(results, desc="Processing duplicates", total=total_items)
-     ):
-         if i not in deduplicated_indices:
-             continue
-
-         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
-
-         for sim_idx in similar_indices:
-             if sim_idx in deduplicated_indices:
-                 deduplicated_indices.remove(sim_idx)
-                 duplicate_to_original_mapping[sim_idx] = i
-
-     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
 
  def display_word_differences(x: str, y: str) -> str:
      diff = ndiff(x.split(), y.split())
-     return " ".join([word for word in diff if word.startswith(("+", "-"))])
 
  def perform_deduplication(
      deduplication_type,
@@ -91,208 +84,86 @@ def perform_deduplication(
      progress=gr.Progress(track_tqdm=True),
  ):
      try:
-         # Convert threshold to float
          threshold = float(threshold)
 
-         # Initialize status message
-         status = ""
 
          if deduplication_type == "Single dataset":
-             # Load Dataset 1
-             status = "Loading Dataset 1..."
-             yield status, ""
-             if (
-                 dataset1_name == default_dataset1_name
-                 and dataset1_split == default_dataset1_split
-             ):
-                 ds = ds_default1
-             else:
-                 ds = load_dataset(dataset1_name, split=dataset1_split)
-
-             # Extract texts
-             status = "Extracting texts from Dataset 1..."
-             yield status, ""
-             texts = [example[dataset1_text_column] for example in ds]
-
-             # Compute embeddings
-             status = "Computing embeddings for Dataset 1..."
-             yield status, ""
-             embedding_matrix = compute_embeddings(
-                 texts,
-                 batch_size=64,
-                 progress=progress,
-                 desc="Computing embeddings for Dataset 1",
-             )
-
-             # Deduplicate
-             status = "Deduplicating embeddings..."
-             yield status, ""
-             deduplicated_indices, duplicate_to_original_mapping = deduplicate(
-                 embedding_matrix, threshold, progress=progress
              )
 
-             # Prepare the results
-             num_duplicates = len(duplicate_to_original_mapping)
-             num_total = len(texts)
-             num_deduplicated = len(deduplicated_indices)
-
-             result_text = f"**Total documents:** {num_total}\n"
-             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
-             result_text += (
-                 f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
              )
 
-             # Show deduplicated examples
              if num_duplicates > 0:
-                 result_text += "**Examples of duplicates found:**\n\n"
-                 num_examples = min(5, num_duplicates)
-                 for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
-                     original_text = texts[original_idx]
-                     duplicate_text = texts[duplicate_idx]
-                     differences = display_word_differences(original_text, duplicate_text)
-                     result_text += f"**Original text:**\n{original_text}\n\n"
-                     result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
-                     result_text += f"**Differences:**\n{differences}\n"
-                     result_text += "-" * 50 + "\n\n"
              else:
                  result_text += "No duplicates found."
 
-             # Final status
-             status = "Deduplication completed."
-             yield status, result_text
-
-         elif deduplication_type == "Cross-dataset":
-             # Similar code for cross-dataset deduplication
-             # Load Dataset 1
-             status = "Loading Dataset 1..."
-             yield status, ""
-             if (
-                 dataset1_name == default_dataset1_name
-                 and dataset1_split == default_dataset1_split
-             ):
-                 ds1 = ds_default1
-             else:
-                 ds1 = load_dataset(dataset1_name, split=dataset1_split)
-
-             # Load Dataset 2
-             status = "Loading Dataset 2..."
-             yield status, ""
-             if (
-                 dataset2_name == default_dataset2_name
-                 and dataset2_split == default_dataset2_split
-             ):
-                 ds2 = ds_default2
-             else:
-                 ds2 = load_dataset(dataset2_name, split=dataset2_split)
-
-             # Extract texts from Dataset 1
-             status = "Extracting texts from Dataset 1..."
-             yield status, ""
-             texts1 = [example[dataset1_text_column] for example in ds1]
-
-             # Extract texts from Dataset 2
-             status = "Extracting texts from Dataset 2..."
-             yield status, ""
-             texts2 = [example[dataset2_text_column] for example in ds2]
-
-             # Compute embeddings for Dataset 1
-             status = "Computing embeddings for Dataset 1..."
-             yield status, ""
-             embedding_matrix1 = compute_embeddings(
-                 texts1,
-                 batch_size=64,
-                 progress=progress,
-                 desc="Computing embeddings for Dataset 1",
-             )
 
-             # Compute embeddings for Dataset 2
-             status = "Computing embeddings for Dataset 2..."
-             yield status, ""
-             embedding_matrix2 = compute_embeddings(
-                 texts2,
-                 batch_size=64,
-                 progress=progress,
-                 desc="Computing embeddings for Dataset 2",
              )
 
-             # Deduplicate across datasets
-             status = "Deduplicating embeddings across datasets..."
-             yield status, ""
-             duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
-                 embedding_matrix1, embedding_matrix2, threshold, progress=progress
              )
 
-             num_duplicates = len(duplicate_indices_in_ds2)
-             num_total_ds2 = len(texts2)
-             num_unique_ds2 = num_total_ds2 - num_duplicates
-
-             result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
-             result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
-             result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
-
-             # Show deduplicated examples
              if num_duplicates > 0:
-                 result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
-                 num_examples = min(5, num_duplicates)
-                 for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
-                     original_idx = duplicate_to_original_mapping[duplicate_idx]
-                     original_text = texts1[original_idx]
-                     duplicate_text = texts2[duplicate_idx]
-                     differences = display_word_differences(original_text, duplicate_text)
-                     result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
-                     result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
-                     result_text += f"**Differences:**\n{differences}\n"
-                     result_text += "-" * 50 + "\n\n"
              else:
                  result_text += "No duplicates found."
 
-             # Final status
-             status = "Deduplication completed."
-             yield status, result_text
 
      except Exception as e:
          yield f"An error occurred: {e}", ""
          raise e
 
- def deduplicate_across_datasets(
-     embedding_matrix_1: np.ndarray,
-     embedding_matrix_2: np.ndarray,
-     threshold: float,
-     batch_size: int = 1024,
-     progress=None
- ) -> tuple[list[int], dict[int, int]]:
-     # Building the index from Dataset 1
-     progress(0, desc="Building search index from Dataset 1...")
-     reach = Reach(
-         vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))]
-     )
-
-     duplicate_indices_in_test = []
-     duplicate_to_original_mapping = {}
-
-     # Finding nearest neighbors between datasets
-     progress(0, desc="Finding nearest neighbors between datasets...")
-     results = reach.nearest_neighbor_threshold(
-         embedding_matrix_2,
-         threshold=threshold,
-         batch_size=batch_size,
-         show_progressbar=False,  # Disable internal progress bar
-     )
-
-     total_items = len(embedding_matrix_2)
-     # Processing duplicates with a progress bar
-     for i, similar_items in enumerate(
-         progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)
-     ):
-         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
-
-         if similar_indices:
-             duplicate_indices_in_test.append(i)
-             duplicate_to_original_mapping[i] = similar_indices[0]
-
-     return duplicate_indices_in_test, duplicate_to_original_mapping
-
- # Adjust the height of the status_output component using custom CSS
  with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
      gr.Markdown("# Semantic Deduplication")
 
@@ -303,38 +174,27 @@ with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
      )
 
      with gr.Row():
-         dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
-         dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
          dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
 
      dataset2_inputs = gr.Column(visible=False)
      with dataset2_inputs:
          gr.Markdown("### Dataset 2")
          with gr.Row():
-             dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
-             dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
              dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
 
-     threshold = gr.Slider(
-         minimum=0.0, maximum=1.0, value=default_threshold, label="Similarity Threshold"
-     )
-
      compute_button = gr.Button("Compute")
-
-     # Use 'gr.Markdown' with 'elem_id' and custom CSS to adjust height
      status_output = gr.Markdown(elem_id="status_output")
      result_output = gr.Markdown()
 
-     # Function to update the visibility of dataset2_inputs
-     def update_visibility(deduplication_type_value):
-         if deduplication_type_value == "Cross-dataset":
-             return gr.update(visible=True)
-         else:
-             return gr.update(visible=False)
 
-     deduplication_type.change(
-         update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
-     )
 
      compute_button.click(
          fn=perform_deduplication,
@@ -353,19 +213,17 @@ with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
 
  demo.launch()
 
-
  # import gradio as gr
  # from datasets import load_dataset
  # import numpy as np
  # from model2vec import StaticModel
  # from reach import Reach
  # from difflib import ndiff
- # import tqdm
 
  # # Load the model at startup
  # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
 
- # # Update default dataset to 'sst2' and set default threshold to 0.9
  # default_dataset1_name = "sst2"
  # default_dataset1_split = "train"
  # default_dataset2_name = "sst2"
@@ -384,29 +242,42 @@ demo.launch()
 
  # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
  #     embeddings = []
- #     for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
- #         batch_embeddings = model.encode(batch, show_progressbar=False)
  #         embeddings.append(batch_embeddings)
  #     return np.concatenate(embeddings, axis=0)
 
- # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
- #     """
- #     Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
- #     """
- #     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
 
  #     deduplicated_indices = set(range(len(embedding_matrix)))
  #     duplicate_to_original_mapping = {}
 
  #     results = reach.nearest_neighbor_threshold(
  #         embedding_matrix,
  #         threshold=threshold,
  #         batch_size=batch_size,
- #         show_progressbar=False
  #     )
 
  #     total_items = len(embedding_matrix)
- #     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
  #         if i not in deduplicated_indices:
  #             continue
 
@@ -419,35 +290,9 @@ demo.launch()
 
  #     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
 
- # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
- #     """
- #     Deduplicate embeddings across two datasets and return the indices of duplicates between them.
- #     """
- #     reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
-
- #     duplicate_indices_in_test = []
- #     duplicate_to_original_mapping = {}
-
- #     results = reach.nearest_neighbor_threshold(
- #         embedding_matrix_2,
- #         threshold=threshold,
- #         batch_size=batch_size,
- #         show_progressbar=False
- #     )
-
- #     total_items = len(embedding_matrix_2)
- #     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
- #         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
-
- #         if similar_indices:
- #             duplicate_indices_in_test.append(i)
- #             duplicate_to_original_mapping[i] = similar_indices[0]
-
- #     return duplicate_indices_in_test, duplicate_to_original_mapping
-
  # def display_word_differences(x: str, y: str) -> str:
  #     diff = ndiff(x.split(), y.split())
- #     return " ".join([word for word in diff if word.startswith(('+', '-'))])
 
  # def perform_deduplication(
  #     deduplication_type,
@@ -458,26 +303,61 @@ demo.launch()
  #     dataset2_split="",
  #     dataset2_text_column="",
  #     threshold=default_threshold,
- #     progress=gr.Progress(track_tqdm=True)
  # ):
  #     try:
  #         threshold = float(threshold)
 
  #         if deduplication_type == "Single dataset":
- #             ds = ds_default1 if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split else load_dataset(dataset1_name, split=dataset1_split)
- #             texts = [example[dataset1_text_column] for example in ds]
 
- #             embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
- #             deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
 
  #             num_duplicates = len(duplicate_to_original_mapping)
  #             num_total = len(texts)
  #             num_deduplicated = len(deduplicated_indices)
 
  #             result_text = f"**Total documents:** {num_total}\n"
  #             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
- #             result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
 
  #             if num_duplicates > 0:
  #                 result_text += "**Examples of duplicates found:**\n\n"
  #                 num_examples = min(5, num_duplicates)
492
  # else:
493
  # result_text += "No duplicates found."
494
 
495
- # yield result_text
 
 
496
 
497
  # elif deduplication_type == "Cross-dataset":
498
- # ds1 = ds_default1 if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split else load_dataset(dataset1_name, split=dataset1_split)
499
- # ds2 = ds_default2 if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split else load_dataset(dataset2_name, split=dataset2_split)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
 
 
 
 
501
  # texts1 = [example[dataset1_text_column] for example in ds1]
502
- # texts2 = [example[dataset2_text_column] for example in ds2]
503
 
504
- # embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
505
- # embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
 
 
506
 
507
- # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
 
509
  # num_duplicates = len(duplicate_indices_in_ds2)
510
  # num_total_ds2 = len(texts2)
@@ -514,6 +445,7 @@ demo.launch()
  #             result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
  #             result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
 
  #             if num_duplicates > 0:
  #                 result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
  #                 num_examples = min(5, num_duplicates)
@@ -529,19 +461,60 @@ demo.launch()
  #             else:
  #                 result_text += "No duplicates found."
 
- #             yield result_text
 
  #     except Exception as e:
  #         yield f"An error occurred: {e}", ""
 
- # # Adjust the height of the status_output and result_output components
- # with gr.Blocks(css="#status_output { height: 300px; overflow: auto; } #result_output { height: 300px; overflow: auto; }") as demo:
  #     gr.Markdown("# Semantic Deduplication")
 
  #     deduplication_type = gr.Radio(
  #         choices=["Single dataset", "Cross-dataset"],
  #         label="Deduplication Type",
- #         value="Single dataset"
  #     )
 
  #     with gr.Row():
@@ -558,17 +531,16 @@ demo.launch()
  #             dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
 
  #     threshold = gr.Slider(
- #         minimum=0.0,
- #         maximum=1.0,
- #         value=default_threshold,
- #         label="Similarity Threshold"
  #     )
 
  #     compute_button = gr.Button("Compute")
 
  #     status_output = gr.Markdown(elem_id="status_output")
- #     result_output = gr.Markdown(elem_id="result_output")
 
  #     def update_visibility(deduplication_type_value):
  #         if deduplication_type_value == "Cross-dataset":
  #             return gr.update(visible=True)
@@ -576,9 +548,7 @@ demo.launch()
  #             return gr.update(visible=False)
 
  #     deduplication_type.change(
- #         update_visibility,
- #         inputs=deduplication_type,
- #         outputs=dataset2_inputs
  #     )
 
  #     compute_button.click(
@@ -591,1115 +561,9 @@ demo.launch()
  #             dataset2_name,
  #             dataset2_split,
  #             dataset2_text_column,
- #             threshold
  #         ],
- #         outputs=[status_output, result_output]
  #     )
 
  # demo.launch()
1517
- # # # # # Load Dataset 1
1518
- # # # # status = "Loading Dataset 1..."
1519
- # # # # yield status, ""
1520
- # # # # if (
1521
- # # # # dataset1_name == default_dataset1_name
1522
- # # # # and dataset1_split == default_dataset1_split
1523
- # # # # ):
1524
- # # # # ds1 = ds_default1
1525
- # # # # else:
1526
- # # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1527
-
1528
- # # # # # Load Dataset 2
1529
- # # # # status = "Loading Dataset 2..."
1530
- # # # # yield status, ""
1531
- # # # # if (
1532
- # # # # dataset2_name == default_dataset2_name
1533
- # # # # and dataset2_split == default_dataset2_split
1534
- # # # # ):
1535
- # # # # ds2 = ds_default2
1536
- # # # # else:
1537
- # # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1538
-
1539
- # # # # # Extract texts from Dataset 1
1540
- # # # # status = "Extracting texts from Dataset 1..."
1541
- # # # # yield status, ""
1542
- # # # # texts1 = [example[dataset1_text_column] for example in ds1]
1543
-
1544
- # # # # # Extract texts from Dataset 2
1545
- # # # # status = "Extracting texts from Dataset 2..."
1546
- # # # # yield status, ""
1547
- # # # # texts2 = [example[dataset2_text_column] for example in ds2]
1548
-
1549
- # # # # # Compute embeddings for Dataset 1
1550
- # # # # status = "Computing embeddings for Dataset 1..."
1551
- # # # # yield status, ""
1552
- # # # # embedding_matrix1 = compute_embeddings(
1553
- # # # # texts1,
1554
- # # # # batch_size=64,
1555
- # # # # progress=progress,
1556
- # # # # desc="Computing embeddings for Dataset 1",
1557
- # # # # )
1558
-
1559
- # # # # # Compute embeddings for Dataset 2
1560
- # # # # status = "Computing embeddings for Dataset 2..."
1561
- # # # # yield status, ""
1562
- # # # # embedding_matrix2 = compute_embeddings(
1563
- # # # # texts2,
1564
- # # # # batch_size=64,
1565
- # # # # progress=progress,
1566
- # # # # desc="Computing embeddings for Dataset 2",
1567
- # # # # )
1568
-
1569
- # # # # # Deduplicate across datasets
1570
- # # # # status = "Deduplicating embeddings across datasets..."
1571
- # # # # yield status, ""
1572
- # # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1573
- # # # # embedding_matrix1, embedding_matrix2, threshold, progress=progress
1574
- # # # # )
1575
-
1576
- # # # # num_duplicates = len(duplicate_indices_in_ds2)
1577
- # # # # num_total_ds2 = len(texts2)
1578
- # # # # num_unique_ds2 = num_total_ds2 - num_duplicates
1579
-
1580
- # # # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1581
- # # # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1582
- # # # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1583
-
1584
- # # # # # Show deduplicated examples
1585
- # # # # if num_duplicates > 0:
1586
- # # # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1587
- # # # # num_examples = min(5, num_duplicates)
1588
- # # # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1589
- # # # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1590
- # # # # original_text = texts1[original_idx]
1591
- # # # # duplicate_text = texts2[duplicate_idx]
1592
- # # # # differences = display_word_differences(original_text, duplicate_text)
1593
- # # # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1594
- # # # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1595
- # # # # result_text += f"**Differences:**\n{differences}\n"
1596
- # # # # result_text += "-" * 50 + "\n\n"
1597
- # # # # else:
1598
- # # # # result_text += "No duplicates found."
1599
-
1600
- # # # # # Final status
1601
- # # # # status = "Deduplication completed."
1602
- # # # # yield status, result_text
1603
-
1604
- # # # # except Exception as e:
1605
- # # # # yield f"An error occurred: {e}", ""
1606
- # # # # raise e
1607
-
1608
- # # # # def deduplicate_across_datasets(
1609
- # # # # embedding_matrix_1: np.ndarray,
1610
- # # # # embedding_matrix_2: np.ndarray,
1611
- # # # # threshold: float,
1612
- # # # # batch_size: int = 1024,
1613
- # # # # progress=None
1614
- # # # # ) -> tuple[list[int], dict[int, int]]:
1615
- # # # # # Building the index from Dataset 1
1616
- # # # # progress(0, desc="Building search index from Dataset 1...")
1617
- # # # # reach = Reach(
1618
- # # # # vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))]
1619
- # # # # )
1620
-
1621
- # # # # duplicate_indices_in_test = []
1622
- # # # # duplicate_to_original_mapping = {}
1623
-
1624
- # # # # # Finding nearest neighbors between datasets
1625
- # # # # progress(0, desc="Finding nearest neighbors between datasets...")
1626
- # # # # results = reach.nearest_neighbor_threshold(
1627
- # # # # embedding_matrix_2,
1628
- # # # # threshold=threshold,
1629
- # # # # batch_size=batch_size,
1630
- # # # # show_progressbar=False, # Disable internal progress bar
1631
- # # # # )
1632
-
1633
- # # # # total_items = len(embedding_matrix_2)
1634
- # # # # # Processing duplicates with a progress bar
1635
- # # # # for i, similar_items in enumerate(
1636
- # # # # progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)
1637
- # # # # ):
1638
- # # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1639
-
1640
- # # # # if similar_indices:
1641
- # # # # duplicate_indices_in_test.append(i)
1642
- # # # # duplicate_to_original_mapping[i] = similar_indices[0]
1643
-
1644
- # # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1645
-
1646
- # # # # # Adjust the height of the status_output component using custom CSS
1647
- # # # # with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
1648
- # # # # gr.Markdown("# Semantic Deduplication")
1649
-
1650
- # # # # deduplication_type = gr.Radio(
1651
- # # # # choices=["Single dataset", "Cross-dataset"],
1652
- # # # # label="Deduplication Type",
1653
- # # # # value="Single dataset",
1654
- # # # # )
1655
-
1656
- # # # # with gr.Row():
1657
- # # # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
1658
- # # # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
1659
- # # # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1660
-
1661
- # # # # dataset2_inputs = gr.Column(visible=False)
1662
- # # # # with dataset2_inputs:
1663
- # # # # gr.Markdown("### Dataset 2")
1664
- # # # # with gr.Row():
1665
- # # # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
1666
- # # # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
1667
- # # # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1668
-
1669
- # # # # threshold = gr.Slider(
1670
- # # # # minimum=0.0, maximum=1.0, value=default_threshold, label="Similarity Threshold"
1671
- # # # # )
1672
-
1673
- # # # # compute_button = gr.Button("Compute")
1674
-
1675
- # # # # # Use 'gr.Markdown' with 'elem_id' and custom CSS to adjust height
1676
- # # # # status_output = gr.Markdown(elem_id="status_output")
1677
- # # # # result_output = gr.Markdown()
1678
-
1679
- # # # # # Function to update the visibility of dataset2_inputs
1680
- # # # # def update_visibility(deduplication_type_value):
1681
- # # # # if deduplication_type_value == "Cross-dataset":
1682
- # # # # return gr.update(visible=True)
1683
- # # # # else:
1684
- # # # # return gr.update(visible=False)
1685
-
1686
- # # # # deduplication_type.change(
1687
- # # # # update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
1688
- # # # # )
1689
-
1690
- # # # # compute_button.click(
1691
- # # # # fn=perform_deduplication,
1692
- # # # # inputs=[
1693
- # # # # deduplication_type,
1694
- # # # # dataset1_name,
1695
- # # # # dataset1_split,
1696
- # # # # dataset1_text_column,
1697
- # # # # dataset2_name,
1698
- # # # # dataset2_split,
1699
- # # # # dataset2_text_column,
1700
- # # # # threshold,
1701
- # # # # ],
1702
- # # # # outputs=[status_output, result_output],
1703
- # # # # )
1704
-
1705
- # # # # demo.launch()
 
5
  from reach import Reach
  from difflib import ndiff

+ # Load the model
  model = StaticModel.from_pretrained("minishlab/M2V_base_output")

+ # Default parameters
+ default_dataset_name = "sst2"
+ default_dataset_split = "train"
  default_text_column = "sentence"
  default_threshold = 0.9


  def batch_iterable(iterable, batch_size):
+     """Yield successive batches from an iterable."""
      for i in range(0, len(iterable), batch_size):
          yield iterable[i:i + batch_size]


+ def compute_embeddings(texts, batch_size, progress, desc):
+     """Compute embeddings for a list of texts with progress tracking."""
      embeddings = []
      total_batches = (len(texts) + batch_size - 1) // batch_size
      for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
+         embeddings.append(model.encode(batch_texts, show_progressbar=False))
          progress((i + 1) / total_batches, desc=desc)
      return np.concatenate(embeddings, axis=0)
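
A quick sketch (hypothetical, not part of app.py) of how compute_embeddings is meant to be driven: any callable accepting a fraction and a desc keyword can stand in for the gr.Progress object, and the helper name print_progress below is made up for illustration.

    def print_progress(fraction, desc=""):
        print(f"{desc}: {fraction:.0%}")

    vectors = compute_embeddings(["first text", "second text", "third text"],
                                 batch_size=2, progress=print_progress, desc="demo")
    # vectors is a (3, dim) numpy array; progress is called once per batch
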
+ def deduplicate_embeddings(
+     embeddings_a: np.ndarray,
+     embeddings_b: np.ndarray = None,
+     threshold: float = 0.9,
      batch_size: int = 1024,
      progress=None
+ ):
+     """Deduplicate within one dataset or across two datasets."""
+     if embeddings_b is None:
+         reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
+         duplicate_to_original = {}
+         results = reach.nearest_neighbor_threshold(
+             embeddings_a, threshold=threshold, batch_size=batch_size, show_progressbar=False
+         )
+         for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embeddings_a))):
+             for sim_idx, _ in similar_items:
+                 sim_idx = int(sim_idx)
+                 if sim_idx != i and sim_idx not in duplicate_to_original:
+                     duplicate_to_original[sim_idx] = i
+         deduplicated_indices = set(range(len(embeddings_a))) - set(duplicate_to_original.keys())
+         return deduplicated_indices, duplicate_to_original
+     else:
+         reach = Reach(vectors=embeddings_a, items=[str(i) for i in range(len(embeddings_a))])
+         duplicate_indices_in_b = []
+         duplicate_to_original = {}
+         results = reach.nearest_neighbor_threshold(
+             embeddings_b, threshold=threshold, batch_size=batch_size, show_progressbar=False
+         )
+         for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embeddings_b))):
+             if similar_items:
+                 duplicate_indices_in_b.append(i)
+                 duplicate_to_original[i] = int(similar_items[0][0])
+         return duplicate_indices_in_b, duplicate_to_original
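
A rough usage sketch (assumed, not taken from the repo): any object exposing a tqdm(iterable, desc=..., total=...) method can stand in for gr.Progress, so the hypothetical PlainProgress shim below is enough to call deduplicate_embeddings outside the UI.

    class PlainProgress:
        def tqdm(self, iterable, desc="", total=None):
            return iterable

    emb = model.encode(["the movie was great", "the movie was great", "something else"],
                       show_progressbar=False)
    unique_idx, dup_map = deduplicate_embeddings(emb, threshold=0.9, progress=PlainProgress())
    # dup_map maps each duplicate row index to the row it collapses onto
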
  def display_word_differences(x: str, y: str) -> str:
+     """Display differences between two texts."""
      diff = ndiff(x.split(), y.split())
+     return " ".join(word for word in diff if word.startswith(("+", "-")))
+
+ def load_dataset_texts(dataset_name, dataset_split, text_column):
+     """Load texts from a specified dataset."""
+     ds = load_dataset(dataset_name, split=dataset_split)
+     return [example[text_column] for example in ds]

  def perform_deduplication(
      deduplication_type,

      progress=gr.Progress(track_tqdm=True),
  ):
      try:
          threshold = float(threshold)

+         # Load and process Dataset 1
+         yield "Loading Dataset 1...", ""
+         texts1 = load_dataset_texts(dataset1_name, dataset1_split, dataset1_text_column)
+         yield "Computing embeddings for Dataset 1...", ""
+         embeddings1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Dataset 1 embeddings")

          if deduplication_type == "Single dataset":
+             # Deduplicate within Dataset 1
+             yield "Deduplicating within Dataset 1...", ""
+             deduplicated_indices, duplicate_mapping = deduplicate_embeddings(
+                 embeddings1, threshold=threshold, progress=progress
              )

+             num_duplicates = len(duplicate_mapping)
+             result_text = (
+                 f"**Total documents:** {len(texts1)}\n"
+                 f"**Duplicates found:** {num_duplicates}\n"
+                 f"**Unique documents after deduplication:** {len(deduplicated_indices)}\n\n"
              )

              if num_duplicates > 0:
+                 result_text += "**Sample duplicates:**\n\n"
+                 for dup_idx, orig_idx in list(duplicate_mapping.items())[:5]:
+                     orig_text = texts1[orig_idx]
+                     dup_text = texts1[dup_idx]
+                     differences = display_word_differences(orig_text, dup_text)
+                     result_text += (
+                         f"**Original:**\n{orig_text}\n\n"
+                         f"**Duplicate:**\n{dup_text}\n\n"
+                         f"**Differences:**\n{differences}\n"
+                         + "-" * 50 + "\n\n"
+                     )
              else:
                  result_text += "No duplicates found."

+             yield "Deduplication completed.", result_text

+         else:
+             # Load and process Dataset 2
+             yield "Loading Dataset 2...", ""
+             texts2 = load_dataset_texts(dataset2_name, dataset2_split, dataset2_text_column)
+             yield "Computing embeddings for Dataset 2...", ""
+             embeddings2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Dataset 2 embeddings")
+
+             # Deduplicate Dataset 2 against Dataset 1
+             yield "Deduplicating Dataset 2 against Dataset 1...", ""
+             duplicate_indices, duplicate_mapping = deduplicate_embeddings(
+                 embeddings1, embeddings_b=embeddings2, threshold=threshold, progress=progress
              )

+             num_duplicates = len(duplicate_indices)
+             result_text = (
+                 f"**Total documents in {dataset2_name}/{dataset2_split}:** {len(texts2)}\n"
+                 f"**Duplicates found in Dataset 2:** {num_duplicates}\n"
+                 f"**Unique documents after deduplication:** {len(texts2) - num_duplicates}\n\n"
              )

              if num_duplicates > 0:
+                 result_text += "**Sample duplicates from Dataset 2:**\n\n"
+                 for idx in duplicate_indices[:5]:
+                     orig_text = texts1[duplicate_mapping[idx]]
+                     dup_text = texts2[idx]
+                     differences = display_word_differences(orig_text, dup_text)
+                     result_text += (
+                         f"**Original (Dataset 1):**\n{orig_text}\n\n"
+                         f"**Duplicate (Dataset 2):**\n{dup_text}\n\n"
+                         f"**Differences:**\n{differences}\n"
+                         + "-" * 50 + "\n\n"
+                     )
              else:
                  result_text += "No duplicates found."

+             yield "Deduplication completed.", result_text

      except Exception as e:
          yield f"An error occurred: {e}", ""
          raise e

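Because perform_deduplication is a generator, every yield of a (status, result) pair streams straight into the two Markdown components wired up below. A minimal sketch of that pattern (hypothetical, not part of app.py):

    def steps():
        yield "Working...", ""
        yield "Done.", "**All finished**"
    # Binding a function like `steps` to a button with outputs=[status_output, result_output]
    # updates both components on every yield, which is how the click handler below streams
    # progress to the UI.
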
  with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
      gr.Markdown("# Semantic Deduplication")

      )

      with gr.Row():
+         dataset1_name = gr.Textbox(value=default_dataset_name, label="Dataset 1 Name")
+         dataset1_split = gr.Textbox(value=default_dataset_split, label="Dataset 1 Split")
          dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")

      dataset2_inputs = gr.Column(visible=False)
      with dataset2_inputs:
          gr.Markdown("### Dataset 2")
          with gr.Row():
+             dataset2_name = gr.Textbox(value=default_dataset_name, label="Dataset 2 Name")
+             dataset2_split = gr.Textbox(value=default_dataset_split, label="Dataset 2 Split")
              dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")

+     threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")

      compute_button = gr.Button("Compute")

      status_output = gr.Markdown(elem_id="status_output")
      result_output = gr.Markdown()

+     def update_visibility(choice):
+         return gr.update(visible=choice == "Cross-dataset")

+     deduplication_type.change(update_visibility, inputs=deduplication_type, outputs=dataset2_inputs)

      compute_button.click(
          fn=perform_deduplication,

  demo.launch()