pmkhanh7890 committed
Commit 0827f9d · 1 Parent(s): a5e8d12

Add comments to text module

application.py CHANGED
@@ -125,7 +125,7 @@ FOR GOVERNOR<br>
125
  - Each highlighted pair (marked with a number) shows the key differences
126
  between the input text and the source.
127
  """
128
- table = f"""
129
  <h5>Comparison between input news and source news:</h5>
130
  <table border="1" style="width:100%; text-align:left;">
131
  <col style="width: 170px;">
@@ -144,7 +144,7 @@ between the input text and the source.
144
  <tr>
145
  <td style="border-bottom: 1px solid transparent";>TBD</td>
146
  <td style="border-bottom: 1px solid transparent";>TBD</td>
147
- <td rowspan="2"> <img src="https://huggingface.co/spaces/pmkhanh7890/news_verification/resolve/main/examples/example_image_input.jpg" alt="A picture of a cat."></td>
148
  <td rowspan="2">TBD</td>
149
  </tr>
150
  <tr>
 
125
  - Each highlighted pair (marked with a number) shows the key differences
126
  between the input text and the source.
127
  """
128
+ table = """
129
  <h5>Comparison between input news and source news:</h5>
130
  <table border="1" style="width:100%; text-align:left;">
131
  <col style="width: 170px;">
 
144
  <tr>
145
  <td style="border-bottom: 1px solid transparent";>TBD</td>
146
  <td style="border-bottom: 1px solid transparent";>TBD</td>
147
+ <td rowspan="2">TBD</td>
148
  <td rowspan="2">TBD</td>
149
  </tr>
150
  <tr>
gpt_test.py CHANGED
@@ -96,7 +96,7 @@ azure_client = AzureOpenAI(
96
  api_version="2024-05-01-preview",
97
  )
98
 
99
- deplopment_name = "gpt-4o" # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
100
  TEXT_PROMPT = """
101
  Paraphrase the following news, only output the paraphrased text:
102
 
 
96
  api_version="2024-05-01-preview",
97
  )
98
 
99
+ deplopment_name = "gpt-4o" # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
100
  TEXT_PROMPT = """
101
  Paraphrase the following news, only output the paraphrased text:
102
 
src/application/config.py ADDED
@@ -0,0 +1,84 @@
1
+ # Download necessary NLTK data files
2
+ """
3
+ Author: Khanh Phan
4
+ Date: 2024-12-04
5
+ """
6
+ import os
7
+
8
+ import nltk
9
+ import openai
10
+ import torch
11
+ from dotenv import load_dotenv
12
+ from sentence_transformers import SentenceTransformer
13
+
14
+ # Load environment variables
15
+ load_dotenv()
16
+ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
17
+ SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
18
+ AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
19
+ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
20
+ AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
21
+
22
+ # GPT Model
23
+ GPT_ENTITY_MODEL = "o1-mini" # "gpt-4o-mini" or "o1-mini"
24
+ GPT_PARAPHRASE_MODELS = ["gpt-4o", "gpt-4o-mini"]
25
+ AZUREOPENAI_CLIENT = openai.AzureOpenAI(
26
+ api_version=AZURE_OPENAI_API_VERSION, # AZURE_OPENAI_API_VERSION,
27
+ api_key=AZURE_OPENAI_API_KEY,
28
+ azure_endpoint=AZURE_OPENAI_ENDPOINT,
29
+ )
30
+
31
+ # Download the resources
32
+ nltk.download("punkt", quiet=True) # Sentence tokenization
33
+ nltk.download("punkt_tab", quiet=True) # Tokenization with tab-separated data
34
+ nltk.download("stopwords", quiet=True) # A list of stop words
35
+ STOPWORDS_LANG = "english"
36
+
37
+ # Load PARAPHRASE_MODEL
38
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
39
+ PARAPHRASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
40
+ PARAPHRASE_MODEL.to(DEVICE)
41
+
42
+ # Model to detect AI-generated text
43
+ AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
44
+
45
+ # Thresholds
46
+ PARAPHRASE_THRESHOLD_HUMAN = 0.963
47
+ PARAPHRASE_THRESHOLD_MACHINE = 0.8
48
+ PARAPHRASE_THRESHOLD = 0.8
49
+
50
+ MIN_SAME_SENTENCE_LEN = 6
51
+ MIN_PHRASE_SENTENCE_LEN = 10
52
+ MIN_RATIO_PARAPHRASE_NUM = 0.5
53
+ MAX_CHAR_SIZE = 30000
54
+
55
+ # Number of top URLs per search
56
+ TOP_URLS_PER_SEARCH = 3
57
+
58
+ # Search parameters
59
+ GOOGLE_ENDPOINT_URL = "https://www.googleapis.com/customsearch/v1"
60
+ TOP_SEARCH_RESUTLS = 10
61
+ CHUNK_SIZE = 32 # words
62
+ NUM_CHUNKS = 3 # number of chunks to search
63
+ NUM_FREQUENT_WORDS = 32 # number of top words to return
64
+ NUM_KEYWORDS = 5 # number of keywords to return
65
+
66
+ # Labels
67
+ MODEL_HUMAN_LABEL = {AI_TEXT_DECTECTION_MODEL: "Human"}
68
+ HUMAN = "HUMAN"
69
+ MACHINE = "MACHINE"
70
+ UNKNOWN = "UNKNOWN"
71
+ PARAPHRASE = "PARAPHRASE"
72
+ NON_PARAPHRASE = "NON_PARAPHRASE"
73
+
74
+ # Entity color
75
+ """
76
+ factor > 1: Lightens the color.
77
+ factor = 1: Leaves the color unchanged.
78
+ factor < 1: Darkens the color.
79
+ factor = 0: Black.
80
+ """
81
+ ENTITY_LIGHTEN_COLOR = 2.2
82
+ ENTITY_DARKEN_COLOR = 0.7
83
+ ENTITY_SATURATION = 0.65 # Saturation: color's intensity (vividness).
84
+ ENTITY_BRIGHTNESS = 0.75 # color's brightness.
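The factor semantics documented in the block above can be checked with a few lines of standalone Python. The sketch below mirrors the HSV scaling that entity.py's set_color_brightness (added later in this commit) performs; the helper name and sample colours here are illustrative only.

    import colorsys

    def scale_brightness(hex_color: str, factor: float) -> str:
        # Illustrative re-implementation of the scaling described above:
        # factor > 1 lightens, factor < 1 darkens, factor = 0 gives black.
        hex_color = hex_color.lstrip("#")
        r, g, b = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
        h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
        v = max(0.0, min(1.0, v * factor))
        r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
        return f"#{r:02x}{g:02x}{b:02x}"

    print(scale_brightness("#ffffff", 0.7))  # "#b2b2b2" (ENTITY_DARKEN_COLOR)
    print(scale_brightness("#000000", 2.2))  # "#000000" (value channel is 0, so black stays black)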
src/application/content_detection.py CHANGED
@@ -16,7 +16,7 @@ from src.application.text.model_detection import (
16
  detect_text_by_ai_model,
17
  predict_generation_model,
18
  )
19
- from src.application.text.preprocessing import split_into_paragraphs, split_into_sentences
20
  from src.application.text.search_detection import (
21
  PARAPHRASE_THRESHOLD_MACHINE,
22
  find_sentence_source,
@@ -112,7 +112,7 @@ class NewsVerification:
112
  na=False,
113
  )
114
  ]
115
-
116
  if len(machine_label) > 0:
117
  label = " ".join(machine_label["label"].tolist())
118
  self.text_prediction_label[0] = label
@@ -147,7 +147,7 @@ class NewsVerification:
147
  print("CHECK TEXT:")
148
  print("\tFrom search engine:")
149
  # Classify by search engine
150
- #input_sentences = split_into_sentences(self.news_text)
151
  input_paragraphs = split_into_paragraphs(self.news_text)
152
 
153
  # Setup df for input_sentences
@@ -402,8 +402,6 @@ class NewsVerification:
402
  if span_row == 1:
403
  last_url_row = True
404
 
405
- # end_of_paragraph = is_newline_after_text(row[0]["input"], self.news_content)
406
-
407
  formatted_row = self.format_text_fact_checker_row(
408
  row,
409
  first_url_row,
@@ -873,11 +871,11 @@ class NewsVerification:
873
 
874
  start_end = list(range(start, end + 1, 1))
875
  start_end = list(set(start_end) - set(ignore_indices))
876
- #new_start, new_end = self.extract_sequences(start_end)
877
  new_start, new_end = self.extract_new_startend(
878
- start,
879
- end,
880
- ignore_indices
881
  )
882
  filtered_starts.extend(new_start)
883
  filtered_ends.extend(new_end)
@@ -885,7 +883,7 @@ class NewsVerification:
885
  return filtered_starts, filtered_ends
886
 
887
  def extract_new_startend(self, start, end, ignore_indices):
888
- # sort a set of ignore_indices
889
  indexes = list(set(ignore_indices))
890
  indexes.sort()
891
 
@@ -896,22 +894,22 @@ class NewsVerification:
896
  new_starts.append(start)
897
  new_ends.append(end)
898
  return new_starts, new_ends
899
-
900
  for index in indexes:
901
  if index < start:
902
  continue
903
  elif index >= end:
904
  continue
905
-
906
  new_starts.append(new_start)
907
  new_ends.append(index)
908
 
909
  new_start = index + 1
910
-
911
  new_starts.append(new_start)
912
  new_ends.append(end)
913
 
914
- return new_starts, new_ends
915
 
916
  def extract_sequences(self, numbers):
917
  if len(numbers) == 1:
 
16
  detect_text_by_ai_model,
17
  predict_generation_model,
18
  )
19
+ from src.application.text.preprocessing import split_into_paragraphs
20
  from src.application.text.search_detection import (
21
  PARAPHRASE_THRESHOLD_MACHINE,
22
  find_sentence_source,
 
112
  na=False,
113
  )
114
  ]
115
+
116
  if len(machine_label) > 0:
117
  label = " ".join(machine_label["label"].tolist())
118
  self.text_prediction_label[0] = label
 
147
  print("CHECK TEXT:")
148
  print("\tFrom search engine:")
149
  # Classify by search engine
150
+ # input_sentences = split_into_sentences(self.news_text)
151
  input_paragraphs = split_into_paragraphs(self.news_text)
152
 
153
  # Setup df for input_sentences
 
402
  if span_row == 1:
403
  last_url_row = True
404
 
 
 
405
  formatted_row = self.format_text_fact_checker_row(
406
  row,
407
  first_url_row,
 
871
 
872
  start_end = list(range(start, end + 1, 1))
873
  start_end = list(set(start_end) - set(ignore_indices))
874
+ # new_start, new_end = self.extract_sequences(start_end)
875
  new_start, new_end = self.extract_new_startend(
876
+ start,
877
+ end,
878
+ ignore_indices,
879
  )
880
  filtered_starts.extend(new_start)
881
  filtered_ends.extend(new_end)
 
883
  return filtered_starts, filtered_ends
884
 
885
  def extract_new_startend(self, start, end, ignore_indices):
886
+ # sort a set of ignore_indices
887
  indexes = list(set(ignore_indices))
888
  indexes.sort()
889
 
 
894
  new_starts.append(start)
895
  new_ends.append(end)
896
  return new_starts, new_ends
897
+
898
  for index in indexes:
899
  if index < start:
900
  continue
901
  elif index >= end:
902
  continue
903
+
904
  new_starts.append(new_start)
905
  new_ends.append(index)
906
 
907
  new_start = index + 1
908
+
909
  new_starts.append(new_start)
910
  new_ends.append(end)
911
 
912
+ return new_starts, new_ends
913
 
914
  def extract_sequences(self, numbers):
915
  if len(numbers) == 1:
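The extract_new_startend method above splits one [start, end] range around a set of ignored indices. A standalone sketch of the same loop (illustrative only, not part of the commit) makes the behaviour concrete:

    def split_range_around(start: int, end: int, ignore_indices: list[int]):
        # Mirrors extract_new_startend: cut [start, end] at every ignored index.
        new_starts, new_ends = [], []
        new_start = start
        for index in sorted(set(ignore_indices)):
            if index < start or index >= end:
                continue
            new_starts.append(new_start)
            new_ends.append(index)
            new_start = index + 1
        new_starts.append(new_start)
        new_ends.append(end)
        return new_starts, new_ends

    print(split_range_around(0, 10, [3, 7]))  # ([0, 4, 8], [3, 7, 10])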
src/application/text/entity.py CHANGED
@@ -1,34 +1,49 @@
 
 
 
 
 
1
  import colorsys
2
  import json
3
- import os
4
  import re
5
 
6
  import gradio as gr
7
  import openai
8
- from dotenv import load_dotenv
9
  from transformers import pipeline
10
 
11
- ner_pipeline = pipeline("ner")
12
-
13
- load_dotenv()
14
- AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
15
- AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
16
- AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
17
-
18
- client = openai.AzureOpenAI(
19
- api_version="2024-05-01-preview", # AZURE_OPENAI_API_VERSION,
20
- api_key=AZURE_OPENAI_API_KEY,
21
- azure_endpoint=AZURE_OPENAI_ENDPOINT,
22
  )
23
 
 
 
24
 
25
  def extract_entities_gpt(
26
  original_text,
27
  compared_text,
28
- text_generation_model="o1-mini",
29
- ):
30
- # "gpt-4o-mini" or "o1-mini"
31
- # Generate text using the selected models
  prompt = f"""
33
  Compare the ORIGINAL TEXT and the COMPARED TEXT.
34
  Find entity pairs with significantly different meanings after paraphrasing.
@@ -60,14 +75,15 @@ If there are no entities that satisfy above condition, output empty list "[]".
60
  {compared_text}
61
  """
62
 
63
- # Generate text using the text generation model
64
  # Generate text using the selected model
65
  try:
66
- response = client.chat.completions.create(
 
67
  model=text_generation_model,
68
  messages=[{"role": "user", "content": prompt}],
69
  )
70
 
 
71
  res = response.choices[0].message.content
72
 
73
  except openai.OpenAIError as e:
@@ -77,15 +93,27 @@ If there are no entities that satisfy above condition, output empty list "[]".
77
  return res
78
 
79
 
80
- def read_json(json_string) -> list[list[str]]:
 
 
 
 
 
 
 
 
 
 
81
  try:
 
82
  entities = json.loads(json_string)
 
83
  # Remove duplicate pairs of entities
84
  unique_entities = []
85
  for inner_list in entities:
 
86
  if inner_list not in unique_entities:
87
  unique_entities.append(inner_list)
88
-
89
  return unique_entities
90
 
91
  except json.JSONDecodeError as e:
@@ -93,66 +121,94 @@ def read_json(json_string) -> list[list[str]]:
93
  return []
94
 
95
 
96
- def lighten_color(hex_color, factor=1.8):
97
- """Lightens a HEX color by increasing its brightness in HSV space."""
98
-
99
- hex_color = hex_color.lstrip("#")
100
- r, g, b = (
101
- int(hex_color[0:2], 16),
102
- int(hex_color[2:4], 16),
103
- int(hex_color[4:6], 16),
104
- )
105
-
106
- # Convert to HSV
107
- h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
108
- v = min(1.0, v * factor) # Increase brightness
109
-
110
- # Convert back to HEX
111
- r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
112
- return f"#{r:02x}{g:02x}{b:02x}"
113
-
114
 
115
- def darken_color(hex_color, factor=0.7):
116
- """Darkens a hex color by reducing its brightness in the HSV space."""
 
117
 
 
 
 
 
118
  hex_color = hex_color.lstrip("#")
 
 
119
  r, g, b = (
120
- int(hex_color[0:2], 16),
121
- int(hex_color[2:4], 16),
122
- int(hex_color[4:6], 16),
123
  )
124
 
125
- # Convert to HSV to adjust brightness
126
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
127
- v = max(0, v * factor) # Reduce brightness
128
 
129
- # Convert back to HEX
 
 
 
130
  r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
 
 
131
  return f"#{r:02x}{g:02x}{b:02x}"
132
 
133
 
134
- def generate_color(index, total_colors=20):
135
- """Generates a unique, evenly spaced color for each index using HSL."""
 
136
 
 
 
 
 
 
 
 
 
 
 
137
  hue = index / total_colors # Spread hues in range [0,1]
138
- saturation = 0.65 # Keep colors vivid
139
- lightness = 0.75 # Balanced brightness
140
 
141
  # Convert HSL to RGB
142
- r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
 
 
143
  r, g, b = int(r * 255), int(g * 255), int(b * 255)
144
 
145
- return f"#{r:02x}{g:02x}{b:02x}" # Convert to hex
 
 
146
 
 
 
 
147
 
148
- def assign_colors_to_entities(entities):
 
 
 
 
 
 
 
 
 
 
 
 
149
  total_colors = len(entities)
150
- # Assign colors to entities
 
151
  entities_colors = []
152
  for index, entity in enumerate(entities):
153
- color = generate_color(index, total_colors)
154
 
155
- # append color and index to entities_colors
156
  entities_colors.append(
157
  {"color": color, "input": entity[0], "source": entity[1]},
158
  )
@@ -160,43 +216,83 @@ def assign_colors_to_entities(entities):
160
  return entities_colors
161
 
162
 
163
- def highlight_entities(text1, text2):
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  if text1 is None or text2 is None:
165
  return None
166
 
 
167
  entities_text = extract_entities_gpt(text1, text2)
168
 
169
- # Clean up entities: remove wrapping characters
170
  entities_text = entities_text.replace("```json", "").replace("```", "")
171
 
 
172
  entities = read_json(entities_text)
 
 
173
  if len(entities) == 0:
174
  return None
175
 
176
- # Assign colors to entities
177
  entities_with_colors = assign_colors_to_entities(entities)
178
 
179
  return entities_with_colors
180
 
181
 
182
- def apply_highlight(text, entities_with_colors, key="input", count=0):
 
183
  if entities_with_colors is None:
184
  return text, []
185
 
 
186
  all_starts = []
187
  all_ends = []
188
  highlighted_text = ""
189
  temp_text = text
 
 
190
  for index, entity in enumerate(entities_with_colors):
191
  highlighted_text = ""
192
 
193
- # find a list of starts and ends of entity in text:
194
- # starts = [m.start() for m in re.finditer(entity[key], temp_text)]
195
- # ends = [m.end() for m in re.finditer(entity[key], temp_text)]
196
  starts = []
197
  ends = []
198
- # "\b" is for bound a word
199
  for m in re.finditer(
 
200
  r"\b" + re.escape(entity[key]) + r"\b",
201
  temp_text,
202
  ):
@@ -206,78 +302,116 @@ def apply_highlight(text, entities_with_colors, key="input", count=0):
206
  all_starts.extend(starts)
207
  all_ends.extend(ends)
208
 
 
209
  color = entities_with_colors[index]["color"]
210
- entity_color = lighten_color(
 
 
211
  color,
212
- factor=2.2,
213
- ) # Lightened color for background text
214
- label_color = darken_color(
 
215
  entity_color,
216
- factor=0.7,
217
- ) # Darker color for background label (index)
218
 
219
- # Apply highlighting to each entity
220
  prev_end = 0
221
  for start, end in zip(starts, ends):
222
- # Append non-highlighted text
223
  highlighted_text += temp_text[prev_end:start]
224
 
225
- # Style the index as a label
226
  index_label = (
227
  f'<span_style="background-color:{label_color};color:white;'
228
  f"padding:1px_4px;border-radius:4px;font-size:12px;"
229
  f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>' # noqa: E501
230
  )
231
 
232
- # Append highlighted text with index label
233
  highlighted_text += (
234
  f'<span_style="background-color:{entity_color};color:black;'
235
  f'border-radius:3px;font-size:14px;display:inline-block;">'
236
  f"{index_label}{temp_text[start:end]}</span>"
237
  )
238
  prev_end = end
 
 
239
  highlighted_text += temp_text[prev_end:]
 
 
240
  temp_text = highlighted_text
241
 
242
  if highlighted_text == "":
243
  return text, []
 
 
244
  highlight_idx_list = get_index_list(highlighted_text)
245
  return highlighted_text, highlight_idx_list
246
 
247
 
248
- def get_index_list(highlighted_text):
249
  """
250
- Generates a list of indices between corresponding start and end indices.
251
 
252
  Args:
253
- starts: A list of starting indices.
254
- ends: A list of ending indices. Must be the same length as starts.
255
 
256
  Returns:
257
- A list containing all indices within the specified ranges.
258
- Returns an empty list if the input is invalid (e.g., different lengths,
259
- end < start, etc.).
260
  """
261
  highlighted_index = []
 
 
262
  words = highlighted_text.split()
263
  for index, word in enumerate(words):
 
264
  if word.startswith("<span_style"):
265
  start_index = index
 
 
266
  if word.endswith("</span>"):
267
  end_index = index
268
-
269
- highlighted_index.extend(list(range(start_index, end_index + 1)))
 
 
 
 
 
 
 
 
 
 
 
270
 
271
  return highlighted_index
272
 
273
 
274
- def extract_entities(text):
 
 
 
 
 
 
 
 
 
 
275
  output = ner_pipeline(text)
 
 
276
  words = extract_words(output)
 
 
277
  words = combine_subwords(words)
278
 
279
- # extract word in each entity and assign to a list of entities,
280
- # connect words if there is no space between them
281
  entities = []
282
  for entity in words:
283
  if entity not in entities:
@@ -286,15 +420,17 @@ def extract_entities(text):
286
  return entities
287
 
288
 
289
- def extract_words(entities):
290
  """
291
  Extracts the words from a list of entities.
292
 
293
  Args:
294
- entities: A list of entities.
 
 
295
 
296
  Returns:
297
- A list of words extracted from the entities.
298
  """
299
  words = []
300
  for entity in entities:
@@ -307,24 +443,26 @@ def combine_subwords(word_list):
307
  Combines subwords (indicated by "##") with the preceding word in a list.
308
 
309
  Args:
310
- word_list: A list of words, where subwords are prefixed with "##".
 
311
 
312
  Returns:
313
- A new list with subwords combined with their preceding words.
 
314
  """
315
  result = []
316
  i = 0
317
  while i < len(word_list):
318
  if word_list[i].startswith("##"):
319
- result[-1] += word_list[i][
320
- 2:
321
- ] # Remove "##" and append to the previous word
322
- elif (
323
- i < len(word_list) - 2 and word_list[i + 1] == "-"
324
- ): # Combine hyphenated words
325
  result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
326
- i += 2 # Skip the next two words
327
  else:
 
 
328
  result.append(word_list[i])
329
  i += 1
330
  return result
@@ -360,6 +498,7 @@ is losing territory in the east. Zelensky praised Japan's commitment
360
  on Thursday, amid wider concerns that the next US President, who is
361
  set to take office on Monday, could potentially reduce aid.
362
  """
 
363
  if __name__ == "__main__":
364
  with gr.Blocks() as demo:
365
  gr.Markdown("### Highlight Matching Parts Between Two Texts")
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
+
6
  import colorsys
7
  import json
 
8
  import re
9
 
10
  import gradio as gr
11
  import openai
 
12
  from transformers import pipeline
13
 
14
+ from src.application.config import (
15
+ AZUREOPENAI_CLIENT,
16
+ ENTITY_BRIGHTNESS,
17
+ ENTITY_DARKEN_COLOR,
18
+ ENTITY_LIGHTEN_COLOR,
19
+ ENTITY_SATURATION,
20
+ GPT_ENTITY_MODEL,
 
 
 
 
21
  )
22
 
23
+ ner_pipeline = pipeline("ner")
24
+
25
 
26
  def extract_entities_gpt(
27
  original_text,
28
  compared_text,
29
+ text_generation_model=GPT_ENTITY_MODEL,
30
+ ) -> str:
31
+ """
32
+ Extracts entity pairs with significantly different meanings between
33
+ two texts using a GPT model.
34
+
35
+ Args:
36
+ original_text (str): The original text.
37
+ compared_text (str): The paraphrased or compared text.
38
+ text_generation_model (str, optional): The GPT model
39
+ to use for entity extraction.
40
+
41
+ Returns:
42
+ str: The JSON-like string containing the extracted entity pairs,
43
+ or an empty string if an error occurs.
44
+ """
45
+
46
+ # Construct the prompt for the GPT model.
47
  prompt = f"""
48
  Compare the ORIGINAL TEXT and the COMPARED TEXT.
49
  Find entity pairs with significantly different meanings after paraphrasing.
 
75
  {compared_text}
76
  """
77
 
 
78
  # Generate text using the selected model
79
  try:
80
+ # Send the prompt to the GPT model and get the response.
81
+ response = AZUREOPENAI_CLIENT.chat.completions.create(
82
  model=text_generation_model,
83
  messages=[{"role": "user", "content": prompt}],
84
  )
85
 
86
+ # Extract the generated content from the response.
87
  res = response.choices[0].message.content
88
 
89
  except openai.OpenAIError as e:
 
93
  return res
94
 
95
 
96
+ def read_json(json_string: str) -> list[list[str, str]]:
97
+ """
98
+ Parses a JSON string and returns a list of unique entity pairs.
99
+
100
+ Args:
101
+ json_string (str): The JSON string to parse.
102
+
103
+ Returns:
104
+ List[List[str, str]]: A list of unique entity pairs,
105
+ or an empty list if parsing fails.
106
+ """
107
  try:
108
+ # Attempt to parse the JSON string into a Python object
109
  entities = json.loads(json_string)
110
+
111
  # Remove duplicates pair of entities
112
  unique_entities = []
113
  for inner_list in entities:
114
+ # Check if the current entity pair is already existed.
115
  if inner_list not in unique_entities:
116
  unique_entities.append(inner_list)
 
117
  return unique_entities
118
 
119
  except json.JSONDecodeError as e:
 
121
  return []
122
 
123
 
124
+ def set_color_brightness(
125
+ hex_color: str,
126
+ brightness_factor: float = ENTITY_LIGHTEN_COLOR,
127
+ ) -> str:
128
+ """
129
+ Adjusts a HEX color's brightness in HSV space (>1 lightens, <1 darkens).
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
+ Args:
132
+ hex_color (str): The HEX color code (e.g., "#RRGGBB").
133
+ brightness_factor (float, optional): The factor by which to scale brightness.
134
 
135
+ Returns:
136
+ str: The lightened HEX color code.
137
+ """
138
+ # Remove the '#' prefix if present.
139
  hex_color = hex_color.lstrip("#")
140
+
141
+ # Convert the HEX color to RGB (red, green, blue) integers.
142
  r, g, b = (
143
+ int(hex_color[0:2], 16), # Red component
144
+ int(hex_color[2:4], 16), # Green component
145
+ int(hex_color[4:6], 16), # Blue component
146
  )
147
 
148
+ # Convert RGB to HSV (hue, saturation, value/brightness)
149
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
 
150
 
151
+ # Scale the brightness by the specified factor, capping it at 1.0.
152
+ v = min(1.0, v * brightness_factor)
153
+
154
+ # Convert the modified HSV back to RGB
155
  r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
156
+
157
+ # Convert the RGB values back to a HEX color code.
158
  return f"#{r:02x}{g:02x}{b:02x}"
159
 
160
 
161
+ def generate_colors(index: int, total_colors: int = 20) -> str:
162
+ """
163
+ Generates a unique, evenly spaced color for each index using HSL.
164
 
165
+ Args:
166
+ index (int): The index for which to generate a color.
167
+ total_colors (int, optional): The total number of colors to
168
+ distribute evenly. Defaults to 20.
169
+
170
+ Returns:
171
+ str: A HEX color code representing the generated color.
172
+ """
173
+ # Calculate the hue value based on the index and total number of colors.
174
+ # This ensures even distribution of hues across the color spectrum.
175
  hue = index / total_colors # Spread hues in range [0,1]
 
 
176
 
177
  # Convert HSL to RGB
178
+ r, g, b = colorsys.hls_to_rgb(hue, ENTITY_SATURATION, ENTITY_BRIGHTNESS)
179
+
180
+ # Scale the RGB values from [0, 1] to [0, 255]
181
  r, g, b = int(r * 255), int(g * 255), int(b * 255)
182
 
183
+ # Convert to hex
184
+ return f"#{r:02x}{g:02x}{b:02x}"
185
+
186
 
187
+ def assign_colors_to_entities(entities: list) -> list[dict]:
188
+ """
189
+ Assigns unique colors to each entity pair in a list.
190
 
191
+ Args:
192
+ entities (list): A list of entity pairs,
193
+ where each pair is a list of two strings.
194
+ Example: [["entity1_original", "entity1_compared"]]
195
+
196
+ Returns:
197
+ list: A list of dictionaries,
198
+ where each dictionary contains
199
+ - "color": the color of entity pair.
200
+ - "input": the original entity string.
201
+ - "source": the compared entity string.
202
+ """
203
+ # Number of colors needed.
204
  total_colors = len(entities)
205
+
206
+ # Assign colors to entities using their index.
207
  entities_colors = []
208
  for index, entity in enumerate(entities):
209
+ color = generate_colors(index, total_colors)
210
 
211
+ # Append color and index to entities_colors
212
  entities_colors.append(
213
  {"color": color, "input": entity[0], "source": entity[1]},
214
  )
 
216
  return entities_colors
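A short usage sketch with a made-up entity list; each pair receives an evenly spaced hue from generate_colors:

    pairs = [["Prime Minister", "President"], ["Tokyo", "Osaka"]]
    for item in assign_colors_to_entities(pairs):
        print(item["color"], item["input"], "->", item["source"])
    # prints one hex colour per pair; the hue depends on the pair's index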
217
 
218
 
219
+ def highlight_entities(text1: str, text2: str) -> list[dict]:
220
+ """
221
+ Highlights entities with significant differences between
222
+ two texts by assigning them unique colors.
223
+
224
+ Args:
225
+ text1 (str): input text.
226
+ text2 (str): source text.
227
+
228
+ Returns:
229
+ list: A list of dictionaries, where each dictionary
230
+ contains the highlighted entity information (color, input, source)
231
+ or None if no significant entities are found or an error occurs.
232
+ """
233
  if text1 is None or text2 is None:
234
  return None
235
 
236
+ # Extract entities with significant differences using a GPT model.
237
  entities_text = extract_entities_gpt(text1, text2)
238
 
239
+ # Clean up the extracted entities string by removing wrapping characters.
240
  entities_text = entities_text.replace("```json", "").replace("```", "")
241
 
242
+ # Parse the cleaned entities string into a Python list of entity pairs.
243
  entities = read_json(entities_text)
244
+
245
+ # If no significant entities are found, return None.
246
  if len(entities) == 0:
247
  return None
248
 
249
+ # Assign unique colors to the extracted entities.
250
  entities_with_colors = assign_colors_to_entities(entities)
251
 
252
  return entities_with_colors
253
 
254
 
255
+ def apply_highlight(
256
+ text: str,
257
+ entities_with_colors: list[dict],
258
+ key: str = "input",
259
+ count: int = 0,
260
+ ) -> tuple[str, list[int]]:
261
+ """
262
+ Applies highlighting to specified entities within a text,
263
+ assigning them unique colors and index labels.
264
+
265
+ Args:
266
+ text (str): The text to highlight.
267
+ entities_with_colors (list): A list of dictionaries,
268
+ where each dictionary represents an entity and its color.
269
+ key (str, optional): The key in the entity dictionary that
270
+ contains the entity text to highlight.
271
+ count (int, optional): An offset to add to the index labels.
272
+
273
+ Returns:
274
+ tuple:
275
+ - A tuple containing the highlighted text (str).
276
+ - A list of index positions (list).
277
+ """
278
  if entities_with_colors is None:
279
  return text, []
280
 
281
+ # Start & end indices of highlighted entities.
282
  all_starts = []
283
  all_ends = []
284
  highlighted_text = ""
285
  temp_text = text
286
+
287
+ # Apply highlighting to each entity.
288
  for index, entity in enumerate(entities_with_colors):
289
  highlighted_text = ""
290
 
 
 
 
291
  starts = []
292
  ends = []
293
+
294
  for m in re.finditer(
295
+ # Word boundaries (\b) and escape special characters
296
  r"\b" + re.escape(entity[key]) + r"\b",
297
  temp_text,
298
  ):
 
302
  all_starts.extend(starts)
303
  all_ends.extend(ends)
304
 
305
+ # Get the colors for each occurrence of the entity.
306
  color = entities_with_colors[index]["color"]
307
+
308
+ # Lightened color for background text
309
+ entity_color = set_color_brightness(
310
  color,
311
+ brightness_factor=ENTITY_LIGHTEN_COLOR,
312
+ )
313
+ # Darker color for background label (index)
314
+ label_color = set_color_brightness(
315
  entity_color,
316
+ brightness_factor=ENTITY_DARKEN_COLOR,
317
+ )
318
 
319
+ # Apply highlighting to each occurrence of the entity.
320
  prev_end = 0
321
  for start, end in zip(starts, ends):
322
+ # Non-highlighted text before the entity.
323
  highlighted_text += temp_text[prev_end:start]
324
 
325
+ # Create the index label with the specified color and style.
326
  index_label = (
327
  f'<span_style="background-color:{label_color};color:white;'
328
  f"padding:1px_4px;border-radius:4px;font-size:12px;"
329
  f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>' # noqa: E501
330
  )
331
 
332
+ # Highlighted entity with the specified color and style.
333
  highlighted_text += (
334
  f'<span_style="background-color:{entity_color};color:black;'
335
  f'border-radius:3px;font-size:14px;display:inline-block;">'
336
  f"{index_label}{temp_text[start:end]}</span>"
337
  )
338
  prev_end = end
339
+
340
+ # Append any remaining text after the last entity.
341
  highlighted_text += temp_text[prev_end:]
342
+
343
+ # Update the temporary text with the highlighted text.
344
  temp_text = highlighted_text
345
 
346
  if highlighted_text == "":
347
  return text, []
348
+
349
+ # Get the index list of the highlighted text.
350
  highlight_idx_list = get_index_list(highlighted_text)
351
  return highlighted_text, highlight_idx_list
352
 
353
 
354
+ def get_index_list(highlighted_text: str) -> list[int]:
355
  """
356
+ Generates a list of indices of highlighted words within a text.
357
 
358
  Args:
359
+ highlighted_text (str): The text containing highlighted words
360
+ wrapped in HTML-like span tags.
361
 
362
  Returns:
363
+ list: A list of indices corresponding to the highlighted words.
364
+ An empty list if no highlighted words are found.
 
365
  """
366
  highlighted_index = []
367
+ start_index = None
368
+ end_index = None
369
  words = highlighted_text.split()
370
  for index, word in enumerate(words):
371
+ # Check if the word marks the start of a highlighted span.
372
  if word.startswith("<span_style"):
373
  start_index = index
374
+
375
+ # Check if the word ends with a closing span tag
376
  if word.endswith("</span>"):
377
  end_index = index
378
+ if start_index is not None:
379
+ # Add the range of indices to the result list.
380
+ highlighted_index.extend(
381
+ list(
382
+ range(
383
+ start_index,
384
+ end_index + 1,
385
+ ),
386
+ ),
387
+ )
388
+
389
+ start_index = None
390
+ end_index = None
391
 
392
  return highlighted_index
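Because the generated markup deliberately avoids spaces inside the span tags (span_style, 1px_4px), whitespace splitting is enough to locate highlighted tokens. A hand-written, simplified example:

    sample = 'The <span_style="x">1</span>Prime Minister</span> spoke today'
    print(get_index_list(sample))  # [1, 2] -> the word positions of "Prime" and "Minister"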
393
 
394
 
395
+ def extract_entities(text: str):
396
+ """
397
+ Extracts named entities from the given text.
398
+
399
+ Args:
400
+ text (str): The input text to extract entities from.
401
+
402
+ Returns:
403
+ list: A list of unique extracted entities (string).
404
+ """
405
+ # Apply the Named Entity Recognition (NER) pipeline to the input text.
406
  output = ner_pipeline(text)
407
+
408
+ # Extract words from the NER pipeline output.
409
  words = extract_words(output)
410
+
411
+ # Combine subwords into complete words.
412
  words = combine_subwords(words)
413
 
414
+ # Append each entity if it is not already in the list.
 
415
  entities = []
416
  for entity in words:
417
  if entity not in entities:
 
420
  return entities
421
 
422
 
423
+ def extract_words(entities: list[dict]) -> list[str]:
424
  """
425
  Extracts the words from a list of entities.
426
 
427
  Args:
428
+ entities (list): A list of entities,
429
+ where each entity is expected to be a dictionary
430
+ containing a "word" key.
431
 
432
  Returns:
433
+ list[str]: A list of words extracted from the entities.
434
  """
435
  words = []
436
  for entity in entities:
 
443
  Combines subwords (indicated by "##") with the preceding word in a list.
444
 
445
  Args:
446
+ word_list (list): A list of words,
447
+ where subwords are prefixed with "##".
448
 
449
  Returns:
450
+ list: A new list with subwords combined with their preceding words
451
+ and hyphenated words combined.
452
  """
453
  result = []
454
  i = 0
455
  while i < len(word_list):
456
  if word_list[i].startswith("##"):
457
+ # Remove "##" and append the remaining to the previous word
458
+ result[-1] += word_list[i][2:]
459
+ elif i < len(word_list) - 2 and word_list[i + 1] == "-":
460
+ # Combine the current word, the hyphen, and the next word.
 
 
461
  result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
462
+ i += 2 # Skip the next two words (hyphen and the following word)
463
  else:
464
+ # If neither a subword nor a hyphenated word,
465
+ # append the current word to the result list.
466
  result.append(word_list[i])
467
  i += 1
468
  return result
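Two worked examples of combine_subwords, using token shapes the NER pipeline typically emits:

    print(combine_subwords(["Frank", "##furt"]))            # ['Frankfurt']
    print(combine_subwords(["co", "-", "founder", "Ada"]))  # ['co-founder', 'Ada']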
 
498
  on Thursday, amid wider concerns that the next US President, who is
499
  set to take office on Monday, could potentially reduce aid.
500
  """
501
+
502
  if __name__ == "__main__":
503
  with gr.Blocks() as demo:
504
  gr.Markdown("### Highlight Matching Parts Between Two Texts")
src/application/text/helper.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import re
2
  import string
3
  from collections import Counter
@@ -8,9 +13,18 @@ from nltk.util import ngrams
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
 
10
 
11
- def clean_text(text):
12
- """Doc cleaning"""
13
- # exclude , and . due to number
 
 
 
 
 
 
 
 
 
14
  punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
15
 
16
  # Lowering text
@@ -22,23 +36,51 @@ def clean_text(text):
22
  # Removing whitespace and newlines
23
  text = re.sub(r"\s+", " ", text)
24
 
 
25
  text = text.replace("£", " * ")
26
 
 
27
  words = text.split()
28
- text = " ".join(words[:18]) # Join the first 18 words back into a string
 
 
29
 
30
  return text
31
 
32
 
33
- def remove_punctuation(text):
34
- """Remove punctuation from a given text."""
 
 
 
 
 
 
 
 
 
 
 
35
  punctuation_without_dot = string.punctuation.replace(".", "")
 
 
36
  translator = str.maketrans("", "", punctuation_without_dot)
 
 
37
  return text.translate(translator)
38
 
39
 
40
  def get_keywords(text, num_keywords=5):
41
- """Return top k keywords from a doc using TF-IDF method"""
 
 
 
 
 
 
 
 
 
42
 
43
  # Create a TF-IDF Vectorizer
44
  vectorizer = TfidfVectorizer(stop_words="english")
@@ -142,41 +184,76 @@ def extract_important_phrases(
142
  return important_phrases
143
 
144
 
145
- def extract_equal_text(text1, text2):
146
- def cleanup(text):
 
147
  text = text.lower()
148
  text = text.translate(str.maketrans("", "", string.punctuation))
149
  return text
150
 
 
151
  splited_text1 = cleanup(text1).split()
152
  splited_text2 = cleanup(text2).split()
153
 
 
154
  s = SequenceMatcher(None, splited_text1, splited_text2)
155
 
156
  equal_idx_1 = []
157
  equal_idx_2 = []
 
 
158
  text1 = text1.split()
159
  text2 = text2.split()
160
  for tag, i1, i2, j1, j2 in s.get_opcodes():
161
  if tag == "equal":
 
 
162
  equal_idx_1.append({"start": i1, "end": i2})
163
  equal_idx_2.append({"start": j1, "end": j2})
164
- subtext_1 = " ".join(text1[i1:i2])
165
- subtext_2 = " ".join(text2[j1:j2])
 
166
  # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] '
167
  # f'{subtext_1!r:>55} --> {subtext_2!r}')
168
  return equal_idx_1, equal_idx_2
169
 
170
 
171
- def connect_consecutive_indexes(nums):
172
  """
173
  Connects consecutive integers in a list.
174
 
175
  Args:
176
- nums: A list of integers.
177
 
178
  Returns:
179
- A list of lists, where each inner list represents a consecutive range.
 
 
180
  """
181
 
182
  if not nums: # Handle empty input
@@ -187,12 +264,15 @@ def connect_consecutive_indexes(nums):
187
  end = nums[0]
188
 
189
  for i in range(1, len(nums)):
 
190
  if nums[i] == end + 1:
191
- end = nums[i]
192
  else:
 
193
  result.append([start, end])
194
  start = nums[i]
195
  end = nums[i]
196
 
197
- result.append([start, end]) # Add the last range
 
198
  return result
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
+
6
  import re
7
  import string
8
  from collections import Counter
 
13
  from sklearn.feature_extraction.text import TfidfVectorizer
14
 
15
 
16
+ def clean_text(text: str) -> str:
17
+ """
18
+ Cleans and preprocesses a given text string.
19
+
20
+ Args:
21
+ text (str): The input text to be cleaned.
22
+
23
+ Returns:
24
+ str: The cleaned and preprocessed text, containing the first 18 words.
25
+ """
26
+ # Define a set of punctuation characters to exclude,
27
+ # keeping comma and period since they appear in numbers
28
  punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
29
 
30
  # Lowering text
 
36
  # Removing whitespace and newlines
37
  text = re.sub(r"\s+", " ", text)
38
 
39
+ # Replace £ with * because Google search doesn't recognize £
40
  text = text.replace("£", " * ")
41
 
42
+ # Split the text into a list of words.
43
  words = text.split()
44
+
45
+ # Join the first 18 words back into a string
46
+ text = " ".join(words[:18]) # TODO: consider another number
47
 
48
  return text
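Part of clean_text's body is elided in this diff, so only the behaviour visible above is illustrated: whatever else the hidden lines do, the final step keeps at most the first 18 words.

    long_text = " ".join(["alpha"] * 30)
    print(len(clean_text(long_text).split()))  # 18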
49
 
50
 
51
+ def remove_punctuation(text: str) -> str:
52
+ """
53
+ Removes all punctuation characters from a string, except for periods (.).
54
+
55
+ Args:
56
+ text (str): The input string.
57
+
58
+ Returns:
59
+ str: The string with all punctuation characters removed,
60
+ except for periods.
61
+ """
62
+ # Create a string containing all punctuation characters,
63
+ # except for periods.
64
  punctuation_without_dot = string.punctuation.replace(".", "")
65
+
66
+ # Create a translation table to remove the specified punctuation chars.
67
  translator = str.maketrans("", "", punctuation_without_dot)
68
+
69
+ # Apply the translation table to the input text and return the result.
70
  return text.translate(translator)
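A quick example: every punctuation mark except the period is stripped.

    print(remove_punctuation("U.K. raises rates, markets react!"))
    # 'U.K. raises rates markets react'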
71
 
72
 
73
  def get_keywords(text, num_keywords=5):
74
+ """
75
+ Extracts the top k keywords from a document using the TF-IDF method.
76
+
77
+ Args:
78
+ text (str): The input text from which to extract keywords.
79
+ num_keywords (int, optional): The number of top keywords to return.
80
+
81
+ Returns:
82
+ list: A list of the top keywords extracted from the text.
83
+ """
84
 
85
  # Create a TF-IDF Vectorizer
86
  vectorizer = TfidfVectorizer(stop_words="english")
 
184
  return important_phrases
185
 
186
 
187
+ def extract_equal_text(text1: str, text2: str) -> tuple[list[dict], list[dict]]:
188
+ """
189
+ Extracts the indices of equal text segments between two strings.
190
+
191
+ Args:
192
+ text1 (str): The first input string.
193
+ text2 (str): The second input string.
194
+
195
+ Returns:
196
+ tuple[
197
+ list[dict{"start": int, "end": int}],
198
+ list[dict{"start": int, "end": int}]
199
+ ]
200
+ - list: the start and end indices of equal segments in text1.
201
+ - list: the start and end indices of equal segments in text2.
202
+ """
203
+
204
+ def cleanup(text: str) -> str:
205
+ """
206
+ Cleans up a text string by converting to lowercase
207
+ and removing punctuation.
208
+
209
+ Args:
210
+ text (str): The input text.
211
+
212
+ Returns:
213
+ str: The cleaned text.
214
+ """
215
  text = text.lower()
216
  text = text.translate(str.maketrans("", "", string.punctuation))
217
  return text
218
 
219
+ # Clean and split the input texts into lists of words.
220
  splited_text1 = cleanup(text1).split()
221
  splited_text2 = cleanup(text2).split()
222
 
223
+ # Create a SequenceMatcher object to compare the cleaned word lists.
224
  s = SequenceMatcher(None, splited_text1, splited_text2)
225
 
226
  equal_idx_1 = []
227
  equal_idx_2 = []
228
+
229
+ # Split the original texts into lists of words (without cleaning).
230
  text1 = text1.split()
231
  text2 = text2.split()
232
  for tag, i1, i2, j1, j2 in s.get_opcodes():
233
  if tag == "equal":
234
+ # Append the start and end indices of the equal segment
235
+ # to the respective lists.
236
  equal_idx_1.append({"start": i1, "end": i2})
237
  equal_idx_2.append({"start": j1, "end": j2})
238
+
239
+ # subtext_1 = " ".join(text1[i1:i2])
240
+ # subtext_2 = " ".join(text2[j1:j2])
241
  # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] '
242
  # f'{subtext_1!r:>55} --> {subtext_2!r}')
243
  return equal_idx_1, equal_idx_2
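A usage sketch of extract_equal_text; the returned indices are word positions, so callers can map them back onto the original (uncleaned) token lists:

    a = "The cat sat on the mat."
    b = "A cat sat on a mat."
    idx_a, idx_b = extract_equal_text(a, b)
    print(idx_a)  # e.g. [{'start': 1, 'end': 4}, {'start': 5, 'end': 6}]
    print(idx_b)  # e.g. [{'start': 1, 'end': 4}, {'start': 5, 'end': 6}]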
244
 
245
 
246
+ def connect_consecutive_indexes(nums: list[int]) -> list[list[int, int]]:
247
  """
248
  Connects consecutive integers in a list.
249
 
250
  Args:
251
+ nums (list): A list of integers.
252
 
253
  Returns:
254
+ list: A list of lists,
255
+ where each inner list represents a consecutive range.
256
+ For example: [1, 2, 3, 5, 6] becomes [[1, 3], [5, 6]].
257
  """
258
 
259
  if not nums: # Handle empty input
 
264
  end = nums[0]
265
 
266
  for i in range(1, len(nums)):
267
+ # Check if the current number is consecutive to the previous end.
268
  if nums[i] == end + 1:
269
+ end = nums[i] # Extend the current range.
270
  else:
271
+ # Add the current range to the result and start a new range.
272
  result.append([start, end])
273
  start = nums[i]
274
  end = nums[i]
275
 
276
+ # Add the last range to the result.
277
+ result.append([start, end])
278
  return result
src/application/text/highlight_text.py DELETED
@@ -1,202 +0,0 @@
1
- import colorsys
2
-
3
- import gradio as gr
4
-
5
-
6
- def lighten_color(hex_color, factor=1.8):
7
- """Lightens a HEX color by increasing its brightness in HSV space."""
8
-
9
- hex_color = hex_color.lstrip("#")
10
- r, g, b = (
11
- int(hex_color[0:2], 16),
12
- int(hex_color[2:4], 16),
13
- int(hex_color[4:6], 16),
14
- )
15
-
16
- # Convert to HSV
17
- h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
18
- v = min(1.0, v * factor) # Increase brightness
19
-
20
- # Convert back to HEX
21
- r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
22
- return f"#{r:02x}{g:02x}{b:02x}"
23
-
24
-
25
- def darken_color(hex_color, factor=0.7):
26
- """Darkens a hex color by reducing its brightness in the HSV space."""
27
-
28
- hex_color = hex_color.lstrip("#")
29
- r, g, b = (
30
- int(hex_color[0:2], 16),
31
- int(hex_color[2:4], 16),
32
- int(hex_color[4:6], 16),
33
- )
34
-
35
- # Convert to HSV to adjust brightness
36
- h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
37
- v = max(0, v * factor) # Reduce brightness
38
-
39
- # Convert back to HEX
40
- r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
41
- return f"#{r:02x}{g:02x}{b:02x}"
42
-
43
-
44
- # Generate unique colors for pairs
45
- def generate_color(index, total_colors=20):
46
- """Generates a unique, evenly spaced color for each index using HSL."""
47
-
48
- hue = index / total_colors # Spread hues in range [0,1]
49
- saturation = 0.65 # Keep colors vivid
50
- lightness = 0.75 # Balanced brightness
51
-
52
- # Convert HSL to RGB
53
- r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
54
- r, g, b = int(r * 255), int(g * 255), int(b * 255)
55
-
56
- return f"#{r:02x}{g:02x}{b:02x}" # Convert to hex
57
-
58
-
59
- def highlight_pairs(text1, text2):
60
- """Highlight matching pairs between two texts"""
61
- # Predefined matching pairs
62
- match_pairs = [
63
- {
64
- "index": 1,
65
- "text1": "deep learning",
66
- "start1": 13,
67
- "end1": 26,
68
- "text2": "deep learning",
69
- "start2": 12,
70
- "end2": 25,
71
- },
72
- {
73
- "index": 2,
74
- "text1": "neural networks",
75
- "start1": 56,
76
- "end1": 71,
77
- "text2": "neural networks",
78
- "start2": 68,
79
- "end2": 83,
80
- },
81
- {
82
- "index": 3,
83
- "text1": "AI research",
84
- "start1": 86,
85
- "end1": 97,
86
- "text2": "AI research",
87
- "start2": 55,
88
- "end2": 66,
89
- },
90
- ]
91
-
92
- # Assign unique colors to each index
93
- pair_colors = {
94
- pair["index"]: generate_color(
95
- pair["index"],
96
- total_colors=len(match_pairs),
97
- )
98
- for pair in match_pairs
99
- }
100
-
101
- def apply_highlight(
102
- text,
103
- pairs,
104
- key_start,
105
- key_end,
106
- key_index,
107
- pair_colors,
108
- ):
109
- highlighted_text = ""
110
- prev_end = 0
111
-
112
- for pair in sorted(pairs, key=lambda x: x[key_start]):
113
- start, end, index = pair[key_start], pair[key_end], pair[key_index]
114
- color = pair_colors.get(
115
- index,
116
- "#ddd",
117
- ) # Default color if not found
118
- color = lighten_color(
119
- color,
120
- factor=2.2,
121
- ) # Lightened color for background text
122
- label_color = darken_color(
123
- color,
124
- factor=0.7,
125
- ) # Make label color darker
126
-
127
- # Style the index as a label
128
- index_label = (
129
- f'<span style="background-color:{label_color}; color:white; '
130
- f"padding:1px 4px; border-radius:4px; font-size:12px; "
131
- f'font-weight:bold; display:inline-block; margin-right:4px;">{index}</span>' # noqa: E501
132
- )
133
-
134
- # Append non-highlighted text
135
- highlighted_text += text[prev_end:start]
136
- # Append highlighted text with index label
137
- highlighted_text += (
138
- f'<span style="background-color:{color}; '
139
- f'border-radius:3px; font-size:14px; display:inline-block;">'
140
- f"{index_label} {text[start:end]}</span>"
141
- )
142
- prev_end = end
143
-
144
- # Append remaining text
145
- highlighted_text += text[prev_end:]
146
- return highlighted_text
147
-
148
- # Apply highlighting to both texts using the global MATCH_PAIRS
149
- highlighted_text1 = apply_highlight(
150
- text1,
151
- match_pairs,
152
- "start1",
153
- "end1",
154
- "index",
155
- pair_colors,
156
- )
157
- highlighted_text2 = apply_highlight(
158
- text2,
159
- match_pairs,
160
- "start2",
161
- "end2",
162
- "index",
163
- pair_colors,
164
- )
165
-
166
- return highlighted_text1, highlighted_text2
167
-
168
-
169
- if __name__ == "__main__":
170
- # Create Gradio Interface
171
- text1 = ""
172
-
173
- with gr.Blocks() as demo:
174
- gr.Markdown("### Highlight Matching Parts Between Two texts")
175
- text1_input = gr.Textbox(
176
- label="Text 1",
177
- lines=5,
178
- value="""
179
- The field of deep learning is advancing rapidly.
180
- Modern neural networks are improving AI research significantly.
181
- """,
182
- )
183
- text2_input = gr.Textbox(
184
- label="Text 2",
185
- lines=5,
186
- value="""
187
- Advances in deep learning have led to breakthroughs in AI research.
188
- Neural networks are at the core of these innovations",
189
- """,
190
- )
191
- output1 = gr.HTML()
192
- output2 = gr.HTML()
193
- submit_button = gr.Button("Highlight Matches")
194
-
195
- submit_button.click(
196
- fn=highlight_pairs,
197
- inputs=[text1_input, text2_input],
198
- outputs=[output1, output2],
199
- )
200
-
201
- # Launch the Gradio app
202
- demo.launch()
src/application/text/model_detection.py CHANGED
@@ -1,44 +1,24 @@
1
- import os
 
 
 
2
 
3
- import torch
4
- from dotenv import load_dotenv
5
- from openai import (
6
- AzureOpenAI,
7
- OpenAIError,
8
- )
9
- from sentence_transformers import (
10
- SentenceTransformer,
11
- util,
12
- )
13
  from transformers import pipeline
14
 
15
- load_dotenv()
16
- AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
17
- AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
18
- AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
19
-
20
- azure_client = AzureOpenAI(
21
- azure_endpoint="https://quoc-nguyen.openai.azure.com/",
22
- api_key=AZURE_OPENAI_API_KEY,
23
- api_version="2024-05-01-preview",
 
24
  )
25
 
26
- # TODO: move to a config file
27
- # AI_TEXT_DECTECTION_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
28
- AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
29
-
30
- MODEL_HUMAN_LABEL = {AI_TEXT_DECTECTION_MODEL: "Human"}
31
- HUMAN = "HUMAN"
32
- MACHINE = "MACHINE"
33
- UNKNOWN = "UNKNOWN"
34
- PARAPHRASE = "PARAPHRASE"
35
- NON_PARAPHRASE = "NON_PARAPHRASE"
36
-
37
- # load the embedding model
38
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
39
- PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
40
- PARAPHASE_MODEL.to(DEVICE)
41
-
42
 
43
  def detect_text_by_ai_model(
44
  input_text: str,
@@ -51,29 +31,43 @@ def detect_text_by_ai_model(
51
 
52
  Detects if text is human or machine generated.
53
 
 
 
 
 
 
54
  Returns:
55
  tuple: (label, confidence_score)
56
  where label is HUMAN or MACHINE.
57
  """
58
  try:
 
59
  pipe = pipeline(
60
  "text-classification",
61
  model=model,
62
  tokenizer=model,
63
- max_length=max_length,
64
  truncation=True,
65
  device_map="auto", # good for GPU usage
66
  )
 
 
67
  input_text = input_text.replace("<br>", " ")
 
 
68
  result = pipe(input_text)[0]
69
  confidence_score = result["score"]
 
 
70
  if result["label"] == MODEL_HUMAN_LABEL[model]:
71
  label = HUMAN
72
  else:
73
  label = MACHINE
74
  generated_model, _ = predict_generation_model(input_text)
75
  label += f"<br>({generated_model})"
 
76
  return label, confidence_score
 
77
  except Exception as e: # Add exception handling
78
  print(f"Error in Roberta model inference: {e}")
79
  return UNKNOWN, 0.5 # Return UNKNOWN and 0.5 confidence if error
@@ -82,20 +76,31 @@ def detect_text_by_ai_model(
82
  def predict_generation_model(text: str) -> tuple[str, float]:
83
  """
84
  Predicts if text is generated by gpt-4o or gpt-4o-mini models.
85
- Compare the input text against the paraphrased text by the models.
 
 
 
86
 
87
  Returns:
88
  tuple: (label, confidence_score)
89
- where label is gpt-4o or gpt-4o-mini.
 
90
  """
91
  best_similarity = 0
92
- best_model = "gpt-4o"
93
- models = ["gpt-4o", "gpt-4o-mini"]
94
- for model in models:
 
95
  paraphrased_text = paraphrase_by_AI(text, model)
 
 
96
  if paraphrased_text is None:
97
  continue
 
 
98
  similarity = measure_text_similarity(text, paraphrased_text)
 
 
99
  if similarity > best_similarity:
100
  best_similarity = similarity
101
  best_model = model
@@ -105,10 +110,14 @@ def predict_generation_model(text: str) -> tuple[str, float]:
105
 
106
  def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str:
107
  """
108
- Paraphrase text using a given model.
 
 
 
 
109
 
110
  Returns:
111
- str: Paraphrased text.
112
  """
113
 
114
  prompt = f"""
@@ -116,18 +125,19 @@ Paraphrase the following news, only output the paraphrased text:
116
  {input_text}
117
  """
118
  try:
119
- response = azure_client.chat.completions.create(
120
  model=model,
121
  messages=[
122
  {"role": "user", "content": prompt},
123
  ],
124
- # max_tokens=100,
125
- # temperature=0.7,
126
- # top_p=0.9,
127
- # n=1,
128
  )
129
  paraphrased_text = response.choices[0].message.content
130
  return paraphrased_text
 
131
  except OpenAIError as e: # Add exception handling
132
  print(f"Error in AI model inference: {e}")
133
  return None
@@ -135,18 +145,24 @@ Paraphrase the following news, only output the paraphrased text:
135
 
136
  def measure_text_similarity(text1: str, text2: str) -> float:
137
  """
138
- Measure the similarity between two texts.
 
 
 
 
 
139
 
140
  Returns:
141
- float: Similarity score.
142
  """
143
- embeddings1 = PARAPHASE_MODEL.encode(
 
144
  text1,
145
  convert_to_tensor=True,
146
  device=DEVICE,
147
  show_progress_bar=False,
148
  )
149
- embeddings2 = PARAPHASE_MODEL.encode(
150
  text2,
151
  convert_to_tensor=True,
152
  device=DEVICE,
@@ -155,5 +171,4 @@ def measure_text_similarity(text1: str, text2: str) -> float:
155
 
156
  # Compute cosine similarity matrix
157
  similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
158
- print(similarity[0][0])
159
  return similarity[0][0]
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
 
6
+ from openai import OpenAIError
7
+ from sentence_transformers import util
 
 
 
 
 
 
 
 
8
  from transformers import pipeline
9
 
10
+ from src.application.config import (
11
+ AI_TEXT_DECTECTION_MODEL,
12
+ AZUREOPENAI_CLIENT,
13
+ DEVICE,
14
+ GPT_PARAPHRASE_MODELS,
15
+ HUMAN,
16
+ MACHINE,
17
+ MODEL_HUMAN_LABEL,
18
+ PARAPHRASE_MODEL,
19
+ UNKNOWN,
20
  )
21
 
 
22
 
23
  def detect_text_by_ai_model(
24
  input_text: str,
 
31
 
32
  Detects if text is human or machine generated.
33
 
34
+ Args:
35
+ input_text (str): The text to be classified.
36
+ model (str, optional): The name of the AI text detection model.
37
+ max_length (int, optional): The maximum length of the input text.
38
+
39
  Returns:
40
  tuple: (label, confidence_score)
41
  where label is HUMAN or MACHINE.
42
  """
43
  try:
44
+ # Create a text classification pipeline using the specified model.
45
  pipe = pipeline(
46
  "text-classification",
47
  model=model,
48
  tokenizer=model,
49
+ max_length=max_length, # TODO: consider: removal
50
  truncation=True,
51
  device_map="auto", # good for GPU usage
52
  )
53
+
54
+ # Replace HTML line breaks with spaces to improve processing.
55
  input_text = input_text.replace("<br>", " ")
56
+
57
+ # Perform text classification using the pipeline.
58
  result = pipe(input_text)[0]
59
  confidence_score = result["score"]
60
+
61
+ # Determine the label based on the model's prediction.
62
  if result["label"] == MODEL_HUMAN_LABEL[model]:
63
  label = HUMAN
64
  else:
65
  label = MACHINE
66
  generated_model, _ = predict_generation_model(input_text)
67
  label += f"<br>({generated_model})"
68
+
69
  return label, confidence_score
70
+
71
  except Exception as e: # Add exception handling
72
  print(f"Error in Roberta model inference: {e}")
73
  return UNKNOWN, 0.5 # Return UNKNOWN and 0.5 confidence if error
 
76
  def predict_generation_model(text: str) -> tuple[str, float]:
77
  """
78
  Predicts if text is generated by gpt-4o or gpt-4o-mini models.
79
+ Compares the input text against the paraphrased text by the models.
80
+
81
+ Args:
82
+ text (str): The input text to be analyzed.
83
 
84
  Returns:
85
  tuple: (label, confidence_score)
86
+ where label is gpt-4o or gpt-4o-mini,
87
+ and confidence_score is the highest similarity.
88
  """
89
  best_similarity = 0
90
+ best_model = GPT_PARAPHRASE_MODELS[0]
91
+
92
+ for model in GPT_PARAPHRASE_MODELS:
93
+ # Generate paraphrased text using the current model.
94
  paraphrased_text = paraphrase_by_AI(text, model)
95
+
96
+ # Skip to the next model if paraphrasing fails (returns None).
97
  if paraphrased_text is None:
98
  continue
99
+
100
+ # Similarity between the original text and the paraphrased text.
101
  similarity = measure_text_similarity(text, paraphrased_text)
102
+
103
+ # Update the best similarity
104
  if similarity > best_similarity:
105
  best_similarity = similarity
106
  best_model = model
 
110
 
111
  def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str:
112
  """
113
+ Paraphrases text using a given AI model.
114
+
115
+ Args:
116
+ input_text (str): The text to be paraphrased.
117
+ model (str, optional): The AI model to use for paraphrasing.
118
 
119
  Returns:
120
+ str: The paraphrased text, or None if an error occurs.
121
  """
122
 
123
  prompt = f"""
 
125
  {input_text}
126
  """
127
  try:
128
+ response = AZUREOPENAI_CLIENT.chat.completions.create(
129
  model=model,
130
  messages=[
131
  {"role": "user", "content": prompt},
132
  ],
133
+ # max_tokens=100, # Limit the number of tokens in the response.
134
+ # temperature=0.7, # Control the randomness of the response.
135
+ # top_p=0.9, # Control the nucleus sampling.
136
+ # n=1, # Generate multiple responses.
137
  )
138
  paraphrased_text = response.choices[0].message.content
139
  return paraphrased_text
140
+
141
  except OpenAIError as e: # Add exception handling
142
  print(f"Error in AI model inference: {e}")
143
  return None
 
145
 
146
  def measure_text_similarity(text1: str, text2: str) -> float:
147
  """
148
+ Measures the similarity between two texts
149
+ using cosine similarity of their sentence embeddings.
150
+
151
+ Args:
152
+ text1 (str): The first text string.
153
+ text2 (str): The second text string.
154
 
155
  Returns:
156
+ float: The cosine similarity score between the two texts.
157
  """
158
+ # Generate sentence embeddings
159
+ embeddings1 = PARAPHRASE_MODEL.encode(
160
  text1,
161
  convert_to_tensor=True,
162
  device=DEVICE,
163
  show_progress_bar=False,
164
  )
165
+ embeddings2 = PARAPHRASE_MODEL.encode(
166
  text2,
167
  convert_to_tensor=True,
168
  device=DEVICE,
 
171
 
172
  # Compute cosine similarity matrix
173
  similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
 
174
  return similarity[0][0]
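As a rough illustration of the comparison that measure_text_similarity performs, here is a sketch assuming sentence-transformers is installed and using the same paraphrase-MiniLM-L6-v2 checkpoint that the config loads:

    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
    emb1 = model.encode("The cat sat on the mat.", convert_to_tensor=True)
    emb2 = model.encode("A cat was sitting on the mat.", convert_to_tensor=True)
    # Close paraphrases typically score well above 0.8.
    print(util.cos_sim(emb1, emb2).item())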
src/application/text/preprocessing.py CHANGED
@@ -1,46 +1,67 @@
 
 
 
 
 
1
  from nltk.tokenize import sent_tokenize
2
 
3
 
4
- def split_into_sentences(input_text):
 
5
  """
6
- Splits input text into sentences by newlines.
 
7
 
8
  Args:
9
- input_text: The input text as a string.
10
 
11
  Returns:
12
- A list of sentences. Returns an empty list if input is not valid.
 
13
  """
14
  if not isinstance(input_text, str):
15
  return []
16
 
 
 
17
  paragraphs = input_text.splitlines(keepends=True)
18
  sentences = []
19
  for paragraph in paragraphs:
 
20
  paragraph = paragraph.strip()
 
21
  if paragraph and paragraph != "\n":
 
22
  sentences.extend(sent_tokenize(paragraph))
 
23
  return sentences
24
 
25
 
26
- def split_into_paragraphs(input_text):
27
  """
28
- Splits input text into sentences by newlines.
29
 
30
  Args:
31
- input_text: The input text as a string.
32
 
33
  Returns:
34
- A list of sentences. Returns an empty list if input is not valid.
 
35
  """
36
  if not isinstance(input_text, str):
37
  return []
38
 
 
 
39
  paragraphs = input_text.splitlines(keepends=True)
40
  out_paragraphs = []
 
41
  for paragraph in paragraphs:
 
42
  paragraph = paragraph.strip()
 
43
  if paragraph and paragraph != "\n":
 
44
  out_paragraphs.append(paragraph)
45
- print(f"paragraphs: {out_paragraphs}")
46
- return out_paragraphs
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
+
6
  from nltk.tokenize import sent_tokenize
7
 
8
 
9
+ # TODO: consider moving to helpers
10
+ def split_into_sentences(input_text: str) -> list[str]:
11
  """
12
+ Splits input text into sentences by newlines
13
+ and then tokenizes each paragraph into sentences.
14
 
15
  Args:
16
+ input_text (str): The input text as a string.
17
 
18
  Returns:
19
+ list: A list of sentences.
20
+ Returns an empty list if input is not a string.
21
  """
22
  if not isinstance(input_text, str):
23
  return []
24
 
25
+ # Split the input text into paragraphs based on newline characters,
26
+ # keeping the newline characters.
27
  paragraphs = input_text.splitlines(keepends=True)
28
  sentences = []
29
  for paragraph in paragraphs:
30
+ # Remove leading/trailing whitespace
31
  paragraph = paragraph.strip()
32
+
33
  if paragraph and paragraph != "\n":
34
+ # Tokenize the paragraph into sentences
35
  sentences.extend(sent_tokenize(paragraph))
36
+
37
  return sentences
38
 
39
 
40
+ def split_into_paragraphs(input_text: str) -> list[str]:
41
  """
42
+ Splits input text into paragraphs based on newline characters.
43
 
44
  Args:
45
+ input_text (str): The input text as a string.
46
 
47
  Returns:
48
+ list: A list of paragraphs.
49
+ Returns an empty list if input is not a string.
50
  """
51
  if not isinstance(input_text, str):
52
  return []
53
 
54
+ # Split the input text into paragraphs based on newline characters,
55
+ # keeping the newline characters.
56
  paragraphs = input_text.splitlines(keepends=True)
57
  out_paragraphs = []
58
+
59
  for paragraph in paragraphs:
60
+ # Remove leading/trailing whitespace
61
  paragraph = paragraph.strip()
62
+
63
  if paragraph and paragraph != "\n":
64
+ # Append the cleaned paragraph to the output list.
65
  out_paragraphs.append(paragraph)
66
+
67
+ return out_paragraphs
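A quick usage sketch for the two helpers above, assuming they are importable and the NLTK punkt data downloaded in config is available:

    from src.application.text.preprocessing import (
        split_into_paragraphs,
        split_into_sentences,
    )

    text = "AI detection is improving. It is not perfect yet.\nA second paragraph follows."
    print(split_into_sentences(text))
    # ['AI detection is improving.', 'It is not perfect yet.', 'A second paragraph follows.']
    print(split_into_paragraphs(text))
    # ['AI detection is improving. It is not perfect yet.', 'A second paragraph follows.']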
src/application/text/search.py CHANGED
@@ -1,38 +1,50 @@
1
- import os
 
 
 
 
2
  import string
3
  from collections import Counter
4
 
5
  import requests
6
- from dotenv import load_dotenv
7
  from nltk.corpus import stopwords
8
  from nltk.tokenize import word_tokenize
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
 
 
 
 
 
 
 
 
 
 
 
 
11
  from src.application.text.entity import extract_entities
12
 
13
- load_dotenv()
14
- GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
15
- SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
16
-
17
 
18
  def search_by_google(
19
  query,
20
- num_results=10,
21
  is_exact_terms=False,
22
  ) -> dict:
23
  """
24
- Searches the Google Custom Search Engine for the given query.
25
 
26
  Args:
27
- query: The search query.
28
- is_exact_terms: Whether to use exact terms search (True) or not.
29
- num_results: The number of results to return (default: 10).
 
 
30
 
31
  Returns:
32
- A dict containing the search results or None if there was an error.
 
33
  """
34
 
35
- url = "https://www.googleapis.com/customsearch/v1"
36
  params = {
37
  "key": GOOGLE_API_KEY,
38
  "cx": SEARCH_ENGINE_ID,
@@ -43,7 +55,7 @@ def search_by_google(
43
  else:
44
  params["q"] = query.replace('"', "")
45
 
46
- response = requests.get(url, params=params)
47
  if response.status_code == 200:
48
  return response.json()
49
  else:
@@ -51,26 +63,35 @@ def search_by_google(
51
  return None
52
 
53
 
54
- def get_most_frequent_words(input_text, number_word=32):
 
 
 
55
  """
56
- Gets the top words from the input text,
57
- excluding stop words and punctuation.
58
 
59
  Args:
60
- input_text: The input text as a string.
61
- number_word: The number of top words to return.
62
 
63
  Returns:
64
- A list of tuples, where each tuple contains a word and its frequency.
65
- Returns an empty list if input is not a string or is empty.
66
  """
 
67
  if not isinstance(input_text, str) or not input_text:
68
- return []
 
 
 
69
 
70
- words = word_tokenize(input_text.lower()) # Tokenize and lowercase
 
71
 
72
- stop_words = set(stopwords.words("english"))
73
- punctuation = set(string.punctuation) # get all punctuation
 
 
74
  filtered_words = [
75
  word
76
  for word in words
@@ -78,32 +99,40 @@ def get_most_frequent_words(input_text, number_word=32):
78
  and word not in stop_words
79
  and word not in punctuation
80
  ]
 
 
81
  word_frequencies = Counter(filtered_words)
 
 
82
  top_words = word_frequencies.most_common(number_word)
83
 
84
  for top_word in top_words:
85
  words.append(top_word[0])
86
 
87
- if len(words) > 32:
88
- search_phrase = " ".join(words[:32])
 
89
  else:
90
  search_phrase = " ".join(words[:number_word])
91
 
92
  return search_phrase
93
 
94
 
95
- def get_chunk(input_text, chunk_length=32, num_chunk=3):
 
 
 
 
96
  """
97
- Splits the input text into chunks of a specified length.
98
 
99
  Args:
100
- input_text: The input text as a string.
101
- num_chunk: The maximum number of chunks to create.
102
- chunk_length: The desired length of each chunk (in words).
103
 
104
  Returns:
105
- A list of string chunks.
106
- Returns an empty list if input is invalid.
107
  """
108
  if not isinstance(input_text, str):
109
  return []
@@ -112,8 +141,11 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
112
  input_words = input_text.split() # Split by any whitespace
113
 
114
  for i in range(num_chunk):
115
- start_index = i * chunk_length
116
- end_index = (i + 1) * chunk_length
 
 
 
117
  chunk = " ".join(input_words[start_index:end_index])
118
  if chunk: # Only append non-empty chunks
119
  chunks.append(chunk)
@@ -121,11 +153,20 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
121
  return chunks
122
 
123
 
124
- def get_keywords(text, num_keywords=5):
125
- """Return top k keywords from a doc using TF-IDF method"""
 
 
 
 
 
126
 
 
 
 
 
127
  # Create a TF-IDF Vectorizer
128
- vectorizer = TfidfVectorizer(stop_words="english")
129
 
130
  # Fit and transform the text
131
  tfidf_matrix = vectorizer.fit_transform([text])
@@ -144,7 +185,7 @@ def get_keywords(text, num_keywords=5):
144
  return [word for word, score in word_scores[:num_keywords]]
145
 
146
 
147
- def generate_search_phrases(input_text):
148
  """
149
  Generates different types of phrases for search purposes.
150
 
@@ -156,6 +197,7 @@ def generate_search_phrases(input_text):
156
  - A list of most frequent words.
157
  - The original input text.
158
  - A list of text chunks.
 
159
  """
160
  if not isinstance(input_text, str):
161
  return []
@@ -171,7 +213,7 @@ def generate_search_phrases(input_text):
171
  # Method 3: Split text by chunks
172
  search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
173
 
174
- # Method 4: Get most identities and key words
175
  entities = extract_entities(input_text)
176
  text_without_entities = remove_identities_from_text(input_text, entities)
177
  search_phrases.append(text_without_entities)
@@ -182,7 +224,7 @@ def generate_search_phrases(input_text):
182
  return search_phrases
183
 
184
 
185
- def remove_identities_from_text(input_text, entities):
186
  """
187
  Removes entities from the input text.
188
 
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
+
6
  import string
7
  from collections import Counter
8
 
9
  import requests
 
10
  from nltk.corpus import stopwords
11
  from nltk.tokenize import word_tokenize
12
  from sklearn.feature_extraction.text import TfidfVectorizer
13
 
14
+ from src.application.config import (
15
+ CHUNK_SIZE,
16
+ GOOGLE_API_KEY,
17
+ GOOGLE_ENDPOINT_URL,
18
+ NUM_CHUNKS,
19
+ NUM_FREQUENT_WORDS,
20
+ NUM_KEYWORDS,
21
+ SEARCH_ENGINE_ID,
22
+ STOPWORDS_LANG,
23
+ TOP_SEARCH_RESUTLS,
24
+ )
25
  from src.application.text.entity import extract_entities
26
 
 
 
 
 
27
 
28
  def search_by_google(
29
  query,
30
+ num_results=TOP_SEARCH_RESUTLS,
31
  is_exact_terms=False,
32
  ) -> dict:
33
  """
34
+ Performs a Google Custom Search API query.
35
 
36
  Args:
37
+ query (str): The search query string.
38
+ num_results (int, optional): The number of search results to return.
39
+ Defaults to TOP_SEARCH_RESUTLS.
40
+ is_exact_terms (bool, optional): whether to use an exact phrase search.
41
+ Defaults to False.
42
 
43
  Returns:
44
+ dict: JSON response from the Google Custom Search API,
45
+ or None if an error occurs.
46
  """
47
 
 
48
  params = {
49
  "key": GOOGLE_API_KEY,
50
  "cx": SEARCH_ENGINE_ID,
 
55
  else:
56
  params["q"] = query.replace('"', "")
57
 
58
+ response = requests.get(GOOGLE_ENDPOINT_URL, params=params)
59
  if response.status_code == 200:
60
  return response.json()
61
  else:
 
63
  return None
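For context, the Custom Search call above boils down to a single GET request. A minimal sketch with placeholder credentials (the real key and engine ID come from the environment via config):

    import requests

    params = {
        "key": "<GOOGLE_API_KEY>",    # placeholder
        "cx": "<SEARCH_ENGINE_ID>",   # placeholder
        "q": "example search phrase",
        "num": 10,                    # the API returns at most 10 results per request
    }
    response = requests.get("https://www.googleapis.com/customsearch/v1", params=params)
    if response.status_code == 200:
        urls = [item["link"] for item in response.json().get("items", [])]
        print(urls)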
64
 
65
 
66
+ def get_most_frequent_words(
67
+ input_text: str,
68
+ number_word: int = NUM_FREQUENT_WORDS,
69
+ ) -> str:
70
  """
71
+ Extracts the most frequent words from the input text
72
+ and forms a search phrase.
73
 
74
  Args:
75
+ input_text (str): The text from which to extract frequent words.
76
+ number_word (int, optional): The number of frequent words to extract.
77
 
78
  Returns:
79
+ str: A search phrase of the most frequent words,
+ or None if the input is invalid.
 
80
  """
81
+ # Check if the input text is valid
82
  if not isinstance(input_text, str) or not input_text:
83
+ return None
84
+
85
+ # Tokenize the input text into words and convert to lowercase
86
+ words = word_tokenize(input_text.lower())
87
 
88
+ # Get the set of stop words for the specified language
89
+ stop_words = set(stopwords.words(STOPWORDS_LANG))
90
 
91
+ # Get the set of punctuation characters
92
+ punctuation = set(string.punctuation)
93
+
94
+ # Filter out stop words, punctuation, and non-alphanumeric words
95
  filtered_words = [
96
  word
97
  for word in words
 
99
  and word not in stop_words
100
  and word not in punctuation
101
  ]
102
+
103
+ # Count the frequency of each filtered word
104
  word_frequencies = Counter(filtered_words)
105
+
106
+ # Get the most common words and their frequencies
107
  top_words = word_frequencies.most_common(number_word)
108
 
109
  for top_word in top_words:
110
  words.append(top_word[0])
111
 
112
+ # Construct the search phrase
113
+ if len(words) > NUM_FREQUENT_WORDS:
114
+ search_phrase = " ".join(words[:NUM_FREQUENT_WORDS])
115
  else:
116
  search_phrase = " ".join(words[:number_word])
117
 
118
  return search_phrase
119
 
120
 
121
+ def get_chunk(
122
+ input_text: str,
123
+ chunk_size: int = CHUNK_SIZE,
124
+ num_chunk: int = NUM_CHUNKS,
125
+ ) -> list[str]:
126
  """
127
+ Splits the input text into chunks of a specified size.
128
 
129
  Args:
130
+ input_text (str): The text to be chunked.
131
+ chunk_size (int, optional): The number of words per chunk.
132
+ num_chunk (int, optional): The number of chunks to generate.
133
 
134
  Returns:
135
+ list: A list of chunks of the input text.
 
136
  """
137
  if not isinstance(input_text, str):
138
  return []
 
141
  input_words = input_text.split() # Split by any whitespace
142
 
143
  for i in range(num_chunk):
144
+ # Calculate the start and end indices for the current chunk
145
+ start_index = i * chunk_size
146
+ end_index = (i + 1) * chunk_size
147
+
148
+ # Extract the words for the current chunk and join them into a string
149
  chunk = " ".join(input_words[start_index:end_index])
150
  if chunk: # Only append non-empty chunks
151
  chunks.append(chunk)
 
153
  return chunks
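The chunking above is plain word-window slicing. An equivalent standalone sketch, with illustrative sizes rather than the configured CHUNK_SIZE and NUM_CHUNKS values:

    words = "one two three four five six seven eight nine ten".split()
    chunk_size, num_chunk = 4, 3
    chunks = [
        " ".join(words[i * chunk_size:(i + 1) * chunk_size])
        for i in range(num_chunk)
    ]
    print([c for c in chunks if c])
    # ['one two three four', 'five six seven eight', 'nine ten']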
154
 
155
 
156
+ def get_keywords(text: str, num_keywords: int = NUM_KEYWORDS) -> list[str]:
157
+ """
158
+ Extracts the top keywords from a given text using the TF-IDF method.
159
+
160
+ Args:
161
+ text (str): The input text from which to extract keywords.
162
+ num_keywords (int, optional): The number of top keywords to return.
163
 
164
+ Returns:
165
+ list: A list of strings representing the top keywords extracted
166
+ from the text.
167
+ """
168
  # Create a TF-IDF Vectorizer
169
+ vectorizer = TfidfVectorizer(stop_words=STOPWORDS_LANG)
170
 
171
  # Fit and transform the text
172
  tfidf_matrix = vectorizer.fit_transform([text])
 
185
  return [word for word, score in word_scores[:num_keywords]]
186
 
187
 
188
+ def generate_search_phrases(input_text: str) -> list[str]:
189
  """
190
  Generates different types of phrases for search purposes.
191
 
 
197
  - A list of most frequent words.
198
  - The original input text.
199
  - A list of text chunks.
200
+ - The input text with named entities removed.
201
  """
202
  if not isinstance(input_text, str):
203
  return []
 
213
  # Method 3: Split text by chunks
214
  search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
215
 
216
+ # Method 4: Remove identities and key words
217
  entities = extract_entities(input_text)
218
  text_without_entities = remove_identities_from_text(input_text, entities)
219
  search_phrases.append(text_without_entities)
 
224
  return search_phrases
225
 
226
 
227
+ def remove_identities_from_text(input_text: str, entities: list[str]) -> str:
228
  """
229
  Removes entities from the input text.
230
 
src/application/text/search_detection.py CHANGED
@@ -1,14 +1,22 @@
 
 
 
 
 
1
  import warnings
2
- from difflib import SequenceMatcher
3
 
4
- import nltk
5
  import numpy as np
6
- import torch
7
- from sentence_transformers import (
8
- SentenceTransformer,
9
- util,
 
 
 
 
 
 
10
  )
11
-
12
  from src.application.text.preprocessing import split_into_sentences
13
  from src.application.text.search import (
14
  generate_search_phrases,
@@ -18,39 +26,43 @@ from src.application.url_reader import URLReader
18
 
19
  warnings.simplefilter(action="ignore", category=FutureWarning)
20
 
21
- # Download necessary NLTK data files
22
- nltk.download("punkt", quiet=True)
23
- nltk.download("punkt_tab", quiet=True)
24
- nltk.download("stopwords", quiet=True)
25
-
26
- # load the model
27
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
28
- PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
29
- PARAPHASE_MODEL.to(DEVICE)
30
-
31
- PARAPHRASE_THRESHOLD_HUMAN = 0.963
32
- PARAPHRASE_THRESHOLD_MACHINE = 0.8
33
- PARAPHRASE_THRESHOLD = 0.8
34
-
35
- MIN_SAME_SENTENCE_LEN = 6
36
- MIN_PHRASE_SENTENCE_LEN = 10
37
- MIN_RATIO_PARAPHRASE_NUM = 0.5
38
- MAX_CHAR_SIZE = 30000
39
 
 
 
 
 
 
 
 
 
40
 
41
- def find_sentence_source(text, text_index, sentences_df):
 
 
 
42
 
43
- checked_urls = set()
 
 
 
 
 
 
 
44
  searched_phrases = generate_search_phrases(text[text_index])
45
 
46
  for candidate in searched_phrases:
 
47
  search_results = search_by_google(candidate)
 
 
48
  urls = [item["link"] for item in search_results.get("items", [])]
49
 
50
- for url in urls[:3]:
51
- if url in checked_urls: # visited url
 
52
  continue
53
- if "bbc.com" not in url:
54
  continue
55
 
56
  checked_urls.add(url)
@@ -96,13 +108,13 @@ def find_sentence_source(text, text_index, sentences_df):
96
  if c in sentences_df.columns:
97
  sentences_df.loc[text_index, c] = aligned_sentence[c]
98
 
 
99
  for idx, _ in sentences_df.iterrows():
100
  similarity = sentences_df.loc[idx, "similarity"]
101
  if similarity is not None:
102
  if similarity > PARAPHRASE_THRESHOLD_MACHINE:
103
  continue
104
 
105
- # find matched content in new url
106
  aligned_sentence = check_paraphrase(
107
  text[idx],
108
  source_text,
@@ -125,141 +137,56 @@ def find_sentence_source(text, text_index, sentences_df):
125
  sentences_df.loc[idx, c] = aligned_sentence[c]
126
  return sentences_df, content.images
127
 
 
128
  sentences_df.loc[text_index, "input"] = text[text_index]
129
  return sentences_df, []
130
 
131
 
132
- def longest_common_subsequence(arr1, arr2):
133
- """
134
- Finds the length of the longest common subsequence (contiguous) between
135
- two arrays.
136
-
137
- Args:
138
- arr1: The first array.
139
- arr2: The second array.
140
-
141
- Returns:
142
- The length of the longest common subsequence.
143
- Returns 0 if either input is invalid.
144
- """
145
-
146
- if not isinstance(arr1, list) or not isinstance(arr2, list):
147
- return 0
148
-
149
- n = len(arr1)
150
- m = len(arr2)
151
-
152
- if n == 0 or m == 0: # handle empty list
153
- return 0
154
-
155
- # Create table dp with size (n+1) x (m+1)
156
- dp = [[0] * (m + 1) for _ in range(n + 1)]
157
- max_length = 0
158
-
159
- for i in range(1, n + 1):
160
- for j in range(1, m + 1):
161
- if arr1[i - 1] == arr2[j - 1]:
162
- dp[i][j] = dp[i - 1][j - 1] + 1
163
- max_length = max(max_length, dp[i][j])
164
- else:
165
- dp[i][j] = 0 # set 0 since the array must be consecutive
166
-
167
- return max_length
168
-
169
-
170
- def check_sentence(
171
- input_sentence,
172
- source_sentence,
173
- min_same_sentence_len,
174
- min_phrase_sentence_len,
175
- verbose=False,
176
- ):
177
- """
178
- Checks if two sentences are similar based on exact match or
179
- longest common subsequence.
180
-
181
- Args:
182
- input_sentence: The input sentence.
183
- source_sentence: The source sentence.
184
- min_same_sentence_len: Minimum length for exact sentence match.
185
- min_phrase_sentence_len: Minimum length for common subsequence match.
186
- verbose: If True, print debug information.
187
-
188
- Returns:
189
- True if the sentences are considered similar, False otherwise.
190
- Returns False if input is not valid.
191
- """
192
-
193
- if not isinstance(input_sentence, str) or not isinstance(
194
- source_sentence,
195
- str,
196
- ):
197
- return False
198
-
199
- input_sentence = input_sentence.strip()
200
- source_sentence = source_sentence.strip()
201
-
202
- if not input_sentence or not source_sentence: # handle empty string
203
- return False
204
-
205
- input_words = input_sentence.split() # split without arguments
206
- source_words = source_sentence.split() # split without arguments
207
-
208
- if (
209
- input_sentence == source_sentence
210
- and len(input_words) >= min_same_sentence_len
211
- ):
212
- if verbose:
213
- print("Exact match found.")
214
- return True
215
-
216
- max_overlap_len = longest_common_subsequence(input_words, source_words)
217
- if verbose:
218
- print(f"Max overlap length: {max_overlap_len}") # print overlap length
219
- if max_overlap_len >= min_phrase_sentence_len:
220
- return True
221
-
222
- return False
223
-
224
-
225
- def check_paraphrase(input_text, source_text, url):
226
  """
227
- Checks if the input text is paraphrased in the content at the given URL.
 
228
 
229
  Args:
230
- input_text: The text to check for paraphrase.
231
- page_text: The text of the web page to compare with.
232
- url
233
 
234
  Returns:
235
- A tuple containing:
236
-
 
 
 
 
 
237
  """
238
-
239
  # Extract sentences from input text and web page
240
  input_sentences = split_into_sentences(input_text)
241
 
242
  if not source_text:
243
  return {}
244
-
245
  source_sentences = split_into_sentences(source_text)
 
246
  if not input_sentences or not source_sentences:
247
  return {}
248
 
 
 
249
  additional_sentences = []
250
  for sentence in source_sentences:
251
  if ", external" in sentence:
252
  additional_sentences.append(sentence.replace(", external", ""))
253
  source_sentences.extend(additional_sentences)
254
 
255
- # Encode sentences into embeddings
256
- embeddings1 = PARAPHASE_MODEL.encode(
257
  input_sentences,
258
  convert_to_tensor=True,
259
  device=DEVICE,
260
  show_progress_bar=False,
261
  )
262
- embeddings2 = PARAPHASE_MODEL.encode(
263
  source_sentences,
264
  convert_to_tensor=True,
265
  device=DEVICE,
@@ -272,78 +199,53 @@ def check_paraphrase(input_text, source_text, url):
272
  # Find sentence alignments
273
  inputs = ""
274
  sources = ""
275
- similarities = []
276
-
277
  for i, sentence in enumerate(input_sentences):
278
  max_sim_index = np.argmax(similarity_matrix[i])
279
  max_similarity = similarity_matrix[i][max_sim_index]
280
  best_matched_sentence = source_sentences[max_sim_index]
281
-
282
  inputs += sentence + " "
283
  sources += best_matched_sentence + " "
284
  similarities.append(max_similarity)
285
 
286
-
287
  similarity = sum(similarities) / len(similarities)
288
  label, is_paraphrased = determine_label(max_similarity)
 
 
289
  alignment = {
290
- "input": inputs,
291
- "source": sources,
292
- "similarity": similarity,
293
- "label": label,
294
- "paraphrase": is_paraphrased,
295
- "url": url,
296
- }
 
297
  print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')
298
 
299
  return alignment
300
 
301
 
302
- def similarity_ratio(a, b):
303
  """
304
- Calculates the similarity ratio between two strings using SequenceMatcher.
305
 
306
  Args:
307
- a: The first string.
308
- b: The second string.
309
-
310
- Returns:
311
- A float representing the similarity ratio between 0.0 and 1.0.
312
- Returns 0.0 if either input is None or not a string.
313
- """
314
- if (
315
- not isinstance(a, str)
316
- or not isinstance(b, str)
317
- or a is None
318
- or b is None
319
- ):
320
- return 0.0 # Handle cases where inputs are not strings or None
321
- return SequenceMatcher(None, a, b).ratio()
322
-
323
-
324
- def check_human(alligned_sentences):
325
- """
326
- Checks if a sufficient number of input sentences are found within
327
- source sentences.
328
 
329
  Returns:
330
- bool: True if the condition is met, False otherwise.
 
331
  """
332
- if not alligned_sentences: # Handle empty data case
333
- return False
334
-
335
- if alligned_sentences["similarity"] >= 0.99:
336
- return True
337
- return False
338
-
339
-
340
- def determine_label(similarity):
341
  if similarity >= PARAPHRASE_THRESHOLD_HUMAN:
342
- return "HUMAN", True
343
  elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
344
- return "MACHINE", True
345
  else:
346
- return None, False
347
 
348
 
349
  if __name__ == "__main__":
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
+
6
  import warnings
 
7
 
 
8
  import numpy as np
9
+ from pandas import DataFrame
10
+ from sentence_transformers import util
11
+
12
+ from src.application.config import (
13
+ DEVICE,
14
+ MAX_CHAR_SIZE,
15
+ PARAPHRASE_MODEL,
16
+ PARAPHRASE_THRESHOLD_HUMAN,
17
+ PARAPHRASE_THRESHOLD_MACHINE,
18
+ TOP_URLS_PER_SEARCH,
19
  )
 
20
  from src.application.text.preprocessing import split_into_sentences
21
  from src.application.text.search import (
22
  generate_search_phrases,
 
26
 
27
  warnings.simplefilter(action="ignore", category=FutureWarning)
28
 
29
 
30
+ def find_sentence_source(
31
+ text: list,
32
+ text_index: int,
33
+ sentences_df: DataFrame,
34
+ ) -> tuple[DataFrame, list]:
35
+ """
36
+ Finds the source URL for a given sentence by searching Google
37
+ and checking for paraphrases.
38
 
39
+ Args:
40
+ text (list): A list of sentences.
41
+ text_index (int): The index of the sentence to find the source for.
42
+ sentences_df (pd.DataFrame): A DF to store sentence information.
43
 
44
+ Returns:
45
+ tuple: A tuple of the updated sentences_df and a list of image URLs.
46
+ If a source is found, the DF is updated with source information.
47
+ If no source is found, the DF is updated with the original input.
48
+ """
49
+ checked_urls = set() # Keep track of visited URLs to avoid redundant checks
52
  searched_phrases = generate_search_phrases(text[text_index])
53
 
54
  for candidate in searched_phrases:
55
+ # Search Google for the generated phrase
56
  search_results = search_by_google(candidate)
57
+
58
+ # Extract URLs from search results
59
  urls = [item["link"] for item in search_results.get("items", [])]
60
 
61
+ # Check only the top TOP_URLS_PER_SEARCH URLs from each search
62
+ for url in urls[:TOP_URLS_PER_SEARCH]:
63
+ if url in checked_urls: # Skip already checked URLs
64
  continue
65
+ if "bbc.com" not in url: # TODO: remove when releasing
66
  continue
67
 
68
  checked_urls.add(url)
 
108
  if c in sentences_df.columns:
109
  sentences_df.loc[text_index, c] = aligned_sentence[c]
110
 
111
+ # Check other sentences for better matches in the same source
112
  for idx, _ in sentences_df.iterrows():
113
  similarity = sentences_df.loc[idx, "similarity"]
114
  if similarity is not None:
115
  if similarity > PARAPHRASE_THRESHOLD_MACHINE:
116
  continue
117
 
 
118
  aligned_sentence = check_paraphrase(
119
  text[idx],
120
  source_text,
 
137
  sentences_df.loc[idx, c] = aligned_sentence[c]
138
  return sentences_df, content.images
139
 
140
+ # If no source is found, update the DF with the original input
141
  sentences_df.loc[text_index, "input"] = text[text_index]
142
  return sentences_df, []
143
 
144
 
145
+ def check_paraphrase(input_text: str, source_text: str, url: str) -> dict:
146
  """
147
+ Checks if the input text is a paraphrase of the source text
148
+ by comparing sentence-level similarities.
149
 
150
  Args:
151
+ input_text (str): The text to be checked for paraphrasing.
152
+ source_text (str): The source text to compare against.
153
+ url (str): The URL of the source text (for storing in the result).
154
 
155
  Returns:
156
+ dict: A dictionary containing the alignment information, including:
157
+ - "input": Concatenated input sentences.
158
+ - "source": Concatenated best-matched source sentences.
159
+ - "similarity": Average cosine similarity score.
160
+ - "label": Label determined based on similarity.
161
+ - "paraphrase": Boolean indicating if it's a paraphrase.
162
+ - "url": The source URL.
163
  """
 
164
  # Extract sentences from input text and web page
165
  input_sentences = split_into_sentences(input_text)
166
 
167
  if not source_text:
168
  return {}
 
169
  source_sentences = split_into_sentences(source_text)
170
+
171
  if not input_sentences or not source_sentences:
172
  return {}
173
 
174
+ # Handle external references in source sentences
175
+ # This is specific to BBC news articles
176
  additional_sentences = []
177
  for sentence in source_sentences:
178
  if ", external" in sentence:
179
  additional_sentences.append(sentence.replace(", external", ""))
180
  source_sentences.extend(additional_sentences)
181
 
182
+ # Encode sentences into embeddings using the PARAPHRASE_MODEL
183
+ embeddings1 = PARAPHRASE_MODEL.encode(
184
  input_sentences,
185
  convert_to_tensor=True,
186
  device=DEVICE,
187
  show_progress_bar=False,
188
  )
189
+ embeddings2 = PARAPHRASE_MODEL.encode(
190
  source_sentences,
191
  convert_to_tensor=True,
192
  device=DEVICE,
 
199
  # Find sentence alignments
200
  inputs = ""
201
  sources = ""
202
+ similarities = []
203
+
204
  for i, sentence in enumerate(input_sentences):
205
  max_sim_index = np.argmax(similarity_matrix[i])
206
  max_similarity = similarity_matrix[i][max_sim_index]
207
  best_matched_sentence = source_sentences[max_sim_index]
208
+
209
  inputs += sentence + " "
210
  sources += best_matched_sentence + " "
211
  similarities.append(max_similarity)
212
 
213
+ # Calculate average similarity and determine paraphrase label
214
  similarity = sum(similarities) / len(similarities)
215
  label, is_paraphrased = determine_label(max_similarity)
216
+
217
+ # Create the alignment dictionary
218
  alignment = {
219
+ "input": inputs,
220
+ "source": sources,
221
+ "similarity": similarity,
222
+ "label": label,
223
+ "paraphrase": is_paraphrased,
224
+ "url": url,
225
+ }
226
+
227
  print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')
228
 
229
  return alignment
230
 
231
 
232
+ def determine_label(similarity: float) -> tuple[str | None, bool]:
233
  """
234
+ Determines a label and paraphrase status based on the similarity score.
235
 
236
  Args:
237
+ similarity (float): The similarity score between two texts.
238
 
239
  Returns:
240
+ tuple: A tuple containing the label (str or None)
241
+ and a boolean indicating if it's a paraphrase.
242
  """
 
 
 
 
 
 
 
 
 
243
  if similarity >= PARAPHRASE_THRESHOLD_HUMAN:
244
+ return "HUMAN", True # Human paraphrase
245
  elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
246
+ return "MACHINE", True # Machine paraphrase
247
  else:
248
+ return None, False # Not a paraphrase
249
 
250
 
251
  if __name__ == "__main__":
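To illustrate the thresholding in determine_label, a small sketch using the values previously defined in this module (0.963 for human, 0.8 for machine), which now live in config:

    PARAPHRASE_THRESHOLD_HUMAN = 0.963
    PARAPHRASE_THRESHOLD_MACHINE = 0.8

    def determine_label(similarity):
        # Mirrors the threshold logic above.
        if similarity >= PARAPHRASE_THRESHOLD_HUMAN:
            return "HUMAN", True
        elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
            return "MACHINE", True
        return None, False

    print(determine_label(0.97))  # ('HUMAN', True)
    print(determine_label(0.85))  # ('MACHINE', True)
    print(determine_label(0.50))  # (None, False)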
test.py CHANGED
@@ -1,74 +1,3 @@
1
- import re
2
-
3
- def is_newline_after_text(text1, text2):
4
- """
5
- Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
6
-
7
- Args:
8
- text1: The text to search for.
9
- text2: The text to search within.
10
-
11
- Returns:
12
- A tuple: (True/False if text1 is found, True/False if next char is newline, or None if not found)
13
- """
14
-
15
- match = re.search(re.escape(text1), text2) #escape text1 to handle special characters
16
-
17
- if match:
18
- # Find the next non-space character
19
- next_char_index = match.end()
20
- while next_char_index < len(text2) and text2[next_char_index].isspace():
21
- next_char_index += 1
22
-
23
- if text2[next_char_index:next_char_index+2] == r'\n':
24
- print("newline found")
25
- if next_char_index < len(text2) and text2[next_char_index:next_char_index+2] == r'\n':
26
- return True
27
-
28
- return False
29
-
30
- def is_newline_after_text_2(text1, text2):
31
- """
32
- Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
33
-
34
- Args:
35
- text1: The text to search for.
36
- text2: The text to search within.
37
-
38
- Returns:
39
- True if next char is newline
40
- """
41
- text2 = text2.replace("\n", "\\n")
42
-
43
- ater_text = text2.split(text1)
44
- if len(ater_text) > 1:
45
- ater_text = ater_text[1].lstrip() # Remove spaces
46
- if ater_text.startswith('\n'):
47
- return True
48
- return False
49
-
50
- # Example usage:
51
- text1 = "hello"
52
- text2 = "some text hello \nmore text"
53
- result = is_newline_after_text_2(text1, text2)
54
- print(f"Next char is newline: {result}\n")
55
-
56
- text1 = "hello"
57
- text2 = "some text hello more text"
58
- result = is_newline_after_text_2(text1, text2)
59
- print(f"Next char is newline: {result}\n")
60
-
61
- text1 = "hello"
62
- text2 = "some text hello \nmore text"
63
- result = is_newline_after_text_2(text1, text2)
64
- print(f"Next char is newline: {result}\n")
65
-
66
- text1 = "hello"
67
- text2 = "some text hello\t\nmore text" #test tab space before newline
68
- result = is_newline_after_text_2(text1, text2)
69
- print(f"Next char is newline: {result}\n")
70
-
71
- text1 = "hello." #test special characters
72
- text2 = "some text hello. \nmore text"
73
- result = is_newline_after_text_2(text1, text2)
74
- print(f"Next char is newline: {result}\n")
 
1
+ a = [1, 2]
2
+ a.append(None)
3
+ print(a)