pmkhanh7890 committed
Commit 00b1038 · Parent: e58707f

refactor code + fix bug of label after grouping url

application_2.py DELETED
File without changes
gpt_test.py CHANGED
@@ -91,9 +91,10 @@ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
 AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
 
 azure_client = AzureOpenAI(
-    azure_endpoint="https://quoc-nguyen.openai.azure.com/",
+    azure_endpoint=AZURE_OPENAI_ENDPOINT,
     api_key=AZURE_OPENAI_API_KEY,
-    api_version="2024-05-01-preview",
+    api_version=AZURE_OPENAI_API_VERSION,
+    # API VERSION=[ 2024-12-01-preview, 2024-05-01-preview]
 )
 
 deplopment_name = "gpt-4o"  # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
@@ -127,4 +128,4 @@ for index, news in enumerate(text):
     count += 1
     paraphrased_news = response.choices[0].message.content
 
-    add_text_to_csv("data/MAGE_2_4o.csv", paraphrased_news, count)
+    add_text_to_csv("data/test.csv", paraphrased_news, count)
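
Note: this refactor moves the Azure endpoint and API version out of source and into environment variables. A minimal sketch of the resulting setup, assuming a local .env file loaded with python-dotenv (the loader is an assumption, not part of this commit):

import os

from dotenv import load_dotenv  # assumed helper for local development
from openai import AzureOpenAI

load_dotenv()  # populates AZURE_OPENAI_* from a .env file kept out of git

azure_client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

Keeping the endpoint and key in the environment avoids committing account-specific secrets, which is presumably why the hard-coded values were removed.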
requirements.txt CHANGED
@@ -17,8 +17,10 @@ scikit-learn
 nltk
 numpy
 torch
+tokenizers
 sentence-transformers
 accelerate
+sentencepiece
 
 # Images
 pillow==10.1.0
src/application/config.py CHANGED
@@ -43,6 +43,9 @@ PARAPHRASE_MODEL.to(DEVICE)
 # Model to detect AI-generated text
 AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
 
+# Model to classify AI-generated text
+AI_TEXT_CLASSIFICATION_MODEL = "ductuan024/gpts-detector"
+
 # Thresholds
 PARAPHRASE_THRESHOLD_HUMAN = 0.963
 PARAPHRASE_THRESHOLD_MACHINE = 0.8
@@ -89,3 +92,6 @@ ENTITY_BRIGHTNESS = 0.75  # color's brightness.
 
 # HTML formatting
 WORD_BREAK = "word-break: break-all;"
+
+# Prefix for output MACHINE label of text
+PREFIX = "Partially generated by "
src/application/content_detection.py CHANGED
@@ -74,8 +74,7 @@ class NewsVerification:
             news_image (str): The url of image in news article.
         """
         # Combine title and content for a full text representation.
-        # .strip() removes leading/trailing whitespace for cleaner text.
-        self.news_text = (news_title + "\n\n" + news_content).strip()
+        self.news_text = news_title + "\n\n" + news_content
 
         # if not isinstance(news_title, str) or not isinstance(
         #     news_content,
@@ -90,6 +89,9 @@ class NewsVerification:
         self.news_content = news_content
         self.news_image = news_image
 
+        self.text.input = self.news_text
+        self.image.input = news_image
+
     def group_by_url(self):
         """
        Groups aligned sentences by URL
@@ -138,6 +140,7 @@ class NewsVerification:
             # Detect text origin using an AI model.
             label, score = detect_text_by_ai_model(text)
 
+            print(f"labels = {label}")
             self.text.grouped_url_df.at[index, "label"] = label
             self.text.grouped_url_df.at[index, "score"] = score
 
@@ -169,6 +172,7 @@ class NewsVerification:
                     na=False,
                 )
             ]
+            print(f" machine_label = {machine_label}")
 
             if not machine_label.empty:
                 # If 'gpt-4o' labels are found, post-process and assign.
@@ -185,7 +189,9 @@ class NewsVerification:
                     self.aligned_sentences_df["label"] == "HUMAN"
                 ]
                 self.text.prediction_label[0] = "HUMAN"
-                self.text.prediction_score[0] = machine_label["score"].mean()
+                self.text.prediction_score[0] = self.text.grouped_url_df[
+                    "score"
+                ].mean()
             else:
                 # If no found URLs, use AI detection on the entire input text.
                 print("No source found in the input text")
@@ -306,7 +312,7 @@ class NewsVerification:
 
         # Check if a ratio of remaining filtering-sentences is more than 50%.
         if (
-            len(filtered_by_similarity) / len(self.aligned_sentences_df)
+            len(filtered_by_similarity) / len(filtered_by_url)
            > MIN_RATIO_PARAPHRASE_NUM
         ):
             # check if "MACHINE" is in self.aligned_sentences_df["label"]:
@@ -319,6 +325,7 @@ class NewsVerification:
                 )
                 .any()
             )
+            print(f"contain_machine = \n{contains_machine}")
 
             # TODO: integrate with determine_text_origin
             if contains_machine:
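
Note: the commit message's "fix bug of label after grouping url" is the score change above. The old line averaged only the 'gpt-4o' rows even when the final label was HUMAN; the new one averages the scores of all URL groups. A toy illustration (not project code):

import pandas as pd

grouped_url_df = pd.DataFrame(
    {
        "url": ["a.com", "b.com", "c.com"],
        "label": ["HUMAN", "Partially generated by gpt-4o", "HUMAN"],
        "score": [0.9, 0.7, 0.8],
    }
)
machine_label = grouped_url_df[
    grouped_url_df["label"].str.contains("gpt-4o", na=False)
]
print(machine_label["score"].mean())   # old behaviour: MACHINE rows only -> 0.7
print(grouped_url_df["score"].mean())  # new behaviour: every URL group -> 0.8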
src/application/content_generation.py CHANGED
@@ -86,7 +86,7 @@ def extract_title_content(fake_news: str) -> tuple[str, str]:
     title_start = fake_news.find("# Title: ") + len("# Title: ")
     title_end = fake_news.find("\n", title_start)
     if title_start != -1 and title_end != -1:
-        title = fake_news[title_start:title_end].strip()
+        title = fake_news[title_start:title_end]  # .strip()
 
     title_start = fake_news.find("\n# Content: ") + len(
         "\n# Content: ",
src/application/formatting_fact_checker.py CHANGED
@@ -7,7 +7,10 @@ from src.application.formatting import (
 )
 from src.application.image.image import ImageDetector
 from src.application.text.entity import apply_highlight
-from src.application.text.helper import extract_equal_text
+from src.application.text.helper import (
+    extract_equal_text,
+    replace_leading_spaces,
+)
 from src.application.text.text import TextDetector
 
 
@@ -17,63 +20,66 @@ def create_fact_checker_table(
     image: ImageDetector,
 ):
     rows = []
-    rows.append(format_image_fact_checker_row(image))
-
-    for _, row in aligned_sentences_df.iterrows():
-        if row["input"] is None:
-            continue
-
-        if row["source"] is None:
-            equal_idx_1 = equal_idx_2 = []
-
-        else:  # Get index of equal phrases in input and source sentences
-            equal_idx_1, equal_idx_2 = extract_equal_text(
-                row["input"],
-                row["source"],
+    if image.input is not None:
+        rows.append(format_image_fact_checker_row(image))
+
+    if text.input is not None:
+        for _, row in aligned_sentences_df.iterrows():
+            if row["input"] is None:
+                continue
+
+            if row["source"] is None:
+                equal_idx_1 = equal_idx_2 = []
+
+            else:  # Get index of equal phrases in input and source sentences
+                equal_idx_1, equal_idx_2 = extract_equal_text(
+                    row["input"],
+                    row["source"],
+                )
+
+            text.fact_checker_table.append(
+                [
+                    row,  # aligned_sentences_df
+                    equal_idx_1,  # index of equal text in input
+                    equal_idx_2,  # index of equal text in source
+                    row["entities"],
+                    row["url"],
+                ],
             )
 
-    text.fact_checker_table.append(
-        [
-            row,  # aligned_sentences_df
-            equal_idx_1,  # index of equal text in input
-            equal_idx_2,  # index of equal text in source
-            row["entities"],
-            row["url"],
-        ],
-    )
-
-    previous_url = None
-    span_row = 1
-    for index, row in enumerate(text.fact_checker_table):
-        current_url = row[4]
-        last_url_row = False
-
-        # First row or URL change
-        if index == 0 or current_url != previous_url:
-            first_url_row = True
-            previous_url = current_url
-            # Increase counter "span_row" when the next url is the same
-            while (
-                index + span_row < len(text.fact_checker_table)
-                and text.fact_checker_table[index + span_row][4] == current_url
-            ):
-                span_row += 1
-
-        else:
-            first_url_row = False
-            span_row -= 1
-
-        if span_row == 1:
-            last_url_row = True
-
-        formatted_row = format_text_fact_checker_row(
-            text,
-            row,
-            first_url_row,
-            last_url_row,
-            span_row,
-        )
-        rows.append(formatted_row)
+        previous_url = None
+        span_row = 1
+        for index, row in enumerate(text.fact_checker_table):
+            current_url = row[4]
+            last_url_row = False
+
+            # First row or URL change
+            if index == 0 or current_url != previous_url:
+                first_url_row = True
+                previous_url = current_url
+                # Increase counter "span_row" when the next url is the same
+                while (
+                    index + span_row < len(text.fact_checker_table)
+                    and text.fact_checker_table[index + span_row][4]
+                    == current_url
+                ):
+                    span_row += 1
+
+            else:
+                first_url_row = False
+                span_row -= 1
+
+            if span_row == 1:
+                last_url_row = True
+
+            formatted_row = format_text_fact_checker_row(
+                text,
+                row,
+                first_url_row,
+                last_url_row,
+                span_row,
+            )
+            rows.append(formatted_row)
 
     table = "\n".join(rows)
     return f"""
@@ -102,9 +108,9 @@ def create_fact_checker_table(
 def format_text_fact_checker_row(
     text: TextDetector,
     row: list,
-    first_url_row: bool=True,
-    last_url_row: bool=True,
-    span_row: int=1,
+    first_url_row: bool = True,
+    last_url_row: bool = True,
+    span_row: int = 1,
 ):
     entity_count = 0
     print(f"row: {row}")
@@ -158,10 +164,14 @@
     input_sentence = row[0]["input"]
     source_sentence = row[0]["source"]
 
+    input_sentence = replace_leading_spaces(input_sentence)
+    source_sentence = replace_leading_spaces(source_sentence)
+
     url = row[0]["url"]
 
     # Displayed label and score by url
     filterby_url = text.grouped_url_df[text.grouped_url_df["url"] == url]
+
     if len(filterby_url) > 0:
         label = filterby_url["label"].values[0]
         score = filterby_url["score"].values[0]
@@ -170,7 +180,10 @@
     score = text.prediction_score[0]
 
     # Format displayed url
-    source_text_url = f"""<a href="{url}">{url}</a>"""
+    if url is None:
+        source_text_url = url
+    else:
+        source_text_url = f"""<a href="{url}">{url}</a>"""
 
     # Format displayed entity count
     entity_count_text = format_entity_count(entity_count)
@@ -220,7 +233,10 @@
     """
 
 
-def format_image_fact_checker_row(image):
+def format_image_fact_checker_row(image: ImageDetector):
+    if image.input is None:
+        return ""
+
     if image.referent_url is not None or image.referent_url != "":
         source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
         source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
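
Note: the span_row bookkeeping above merges consecutive table rows that share a URL into a single rowspan cell. The same algorithm on a plain list of URLs, for illustration:

urls = ["a.com", "a.com", "b.com"]  # stand-in for column 4 of fact_checker_table

previous_url = None
span_row = 1
for index, current_url in enumerate(urls):
    last_url_row = False
    if index == 0 or current_url != previous_url:
        first_url_row = True
        previous_url = current_url
        # count ahead while the following rows keep the same URL
        while index + span_row < len(urls) and urls[index + span_row] == current_url:
            span_row += 1
    else:
        first_url_row = False
        span_row -= 1
    if span_row == 1:
        last_url_row = True
    print(index, first_url_row, last_url_row, span_row)
# 0 True False 2  -> first "a.com" row renders the merged cell (rowspan=2)
# 1 False True 1  -> second "a.com" row is the last of its group
# 2 True True 1   -> "b.com" stands alone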
src/application/formatting_governor.py CHANGED
@@ -7,7 +7,10 @@ from src.application.formatting import (
 )
 from src.application.image.image import ImageDetector
 from src.application.text.entity import apply_highlight
-from src.application.text.helper import extract_equal_text
+from src.application.text.helper import (
+    extract_equal_text,
+    replace_leading_spaces,
+)
 from src.application.text.text import TextDetector
 
 
@@ -17,32 +20,34 @@ def create_governor_table(
     image: ImageDetector,
 ):
     rows = []
-    rows.append(format_image_governor_row(image))
+    if image.input is not None:
+        rows.append(format_image_governor_row(image))
 
-    for _, row in aligned_sentences_df.iterrows():
-        if row["input"] is None:
-            continue
+    if text.input is not None:
+        for _, row in aligned_sentences_df.iterrows():
+            if row["input"] is None:
+                continue
 
-        if row["source"] is None:
-            equal_idx_1 = equal_idx_2 = []
-        else:
-            # Get index of equal phrases in input and source sentences
-            equal_idx_1, equal_idx_2 = extract_equal_text(
-                row["input"],
-                row["source"],
+            if row["source"] is None:
+                equal_idx_1 = equal_idx_2 = []
+            else:
+                # Get index of equal phrases in input and source sentences
+                equal_idx_1, equal_idx_2 = extract_equal_text(
+                    row["input"],
+                    row["source"],
+                )
+
+            text.governor_table.append(
+                [
+                    row,
+                    equal_idx_1,
+                    equal_idx_2,
+                    row["entities"],
+                ],
             )
 
-        text.governor_table.append(
-            [
-                row,
-                equal_idx_1,
-                equal_idx_2,
-                row["entities"],
-            ],
-        )
-
-        formatted_row = format_text_governor_row(text)
-        rows.append(formatted_row)
+        formatted_row = format_text_governor_row(text)
+        rows.append(formatted_row)
 
     table = "\n".join(rows)
     return f"""
@@ -123,9 +128,11 @@ def format_text_governor_row(text):
     source_sentence = row[0]["source"]
     input_sentence = row[0]["input"]
 
-    # convert score to HUMAN-based score:
-    input_sentences += input_sentence + "<br><br>"
-    source_sentences += source_sentence + "<br><br>"
+    input_sentence = replace_leading_spaces(input_sentence)
+    source_sentence = replace_leading_spaces(source_sentence)
+
+    input_sentences += input_sentence + "<br>"
+    source_sentences += source_sentence + "<br>"
 
     url = row[0]["url"]
     if url not in urls:
@@ -149,6 +156,9 @@
 
 
 def format_image_governor_row(image):
+    if image.input is None:
+        return ""
+
     if image.referent_url is not None or image.referent_url != "":
         source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
         source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
src/application/formatting_ordinary_user.py CHANGED
@@ -2,6 +2,7 @@ from pandas import DataFrame
 
 from src.application.config import WORD_BREAK
 from src.application.image.image import ImageDetector
+from src.application.text.helper import replace_leading_spaces
 from src.application.text.text import TextDetector
 
 
@@ -10,9 +11,26 @@ def create_ordinary_user_table(
     text: TextDetector,
     image: ImageDetector,
 ) -> str:
+    """
+    Creates an HTML table comparing input news with source news
+    for ordinary users.
+
+    Args:
+        aligned_sentences_df (DataFrame): Aligned sentence data.
+        text (TextDetector): Text comparison data.
+        image (ImageDetector): Image comparison data.
+
+    Returns:
+        A string representing the HTML table.
+    """
     rows = []
-    rows.append(format_image_ordinary_user_row(image))
-    rows.append(format_text_ordinary_user_row(aligned_sentences_df, text))
+
+    if image.input is not None:
+        rows.append(format_image_ordinary_user_row(image))
+
+    if text.input is not None:
+        rows.append(format_text_ordinary_user_row(aligned_sentences_df, text))
+
     table = "\n".join(rows)
 
     return f"""
@@ -32,8 +50,6 @@ def create_ordinary_user_table(
     {table}
     </tbody>
     </table>
-
-    <style>
     """
 
 
@@ -41,6 +57,16 @@ def format_text_ordinary_user_row(
     aligned_sentences_df,
     text,
 ) -> str:
+    """
+    Formats a row for the text in the ordinary user table.
+
+    Args:
+        aligned_sentences_df (DataFrame): Aligned sentence data.
+        text (TextDetector): Text comparison data.
+
+    Returns:
+        A string representing the HTML table row for the text.
+    """
     input_sentences = ""
     source_text_html = ""
     urls = []
@@ -48,7 +74,7 @@ def format_text_ordinary_user_row(
         if row["input"] is None:
             continue
 
-        input_sentences += row["input"] + "<br><br>"
+        input_sentences += replace_leading_spaces(row["input"]) + "<br>"
         url = row["url"]
         if url not in urls:
             urls.append(url)
@@ -66,15 +92,16 @@ def format_text_ordinary_user_row(
 
 def format_image_ordinary_user_row(image: ImageDetector) -> str:
     """
-    Formats an HTML table row for ordinary users,
-    displaying image analysis results.
+    Formats a row for the image in the ordinary user table.
 
     Args:
-        image (ImageDetector): The image to be analyzed.
+        image: Image comparison data.
 
     Returns:
-        str: An HTML table row string containing the image analysis results.
+        A string representing the HTML table row for the image.
     """
+    if image.input is None:
+        return ""
 
     # Put image, label, and score into html tag
     if image.referent_url is not None or image.referent_url != "":
src/application/image/helper.py ADDED
@@ -0,0 +1,12 @@
+import base64
+
+
+def encode_image(image_path):
+    with open(image_path, "rb") as img_file:
+        return base64.b64encode(img_file.read()).decode()
+
+
+image_base64 = encode_image(
+    "/content/ai-generated-picture-of-a-tiger-walking-in-the-forest-photo.jpg",
+)
+html_code = f'<img src="data:image/jpeg;base64,{image_base64}" width="300">'
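
Note: as committed, this helper runs at import time against a hard-coded Colab path (/content/...). A guarded variant, sketched here with a placeholder path, would keep the demo out of import side effects:

import base64


def encode_image(image_path: str) -> str:
    # Read an image file and return its base64-encoded contents.
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode()


if __name__ == "__main__":
    image_base64 = encode_image("path/to/image.jpg")  # placeholder path
    html_code = f'<img src="data:image/jpeg;base64,{image_base64}" width="300">'
    print(html_code[:60])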
src/application/image/image.py CHANGED
@@ -1,5 +1,6 @@
 class ImageDetector:
     def __init__(self):
+        self.input = None
         self.referent_url: str = None  # URL of the referenced image.
         self.prediction_label: str = None
         self.prediction_score: float = None
src/application/text/ai_classification.py ADDED
@@ -0,0 +1,86 @@
+from typing import (
+    Dict,
+    List,
+    Tuple,
+)
+
+import torch
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+)
+
+from src.application.config import AI_TEXT_CLASSIFICATION_MODEL
+
+
+def load_model_and_tokenizer(
+    model_path: str = AI_TEXT_CLASSIFICATION_MODEL,
+) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]:
+    """
+    Loads the trained model and tokenizer from the specified path.
+
+    Args:
+        model_path: path of directory containing the saved model and tokenizer.
+
+    Returns:
+        A tuple containing the loaded tokenizer and model.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForSequenceClassification.from_pretrained(model_path)
+    model.eval()
+    return tokenizer, model
+
+
+def predict(
+    texts: List[str],
+    model: AutoModelForSequenceClassification,
+    tokenizer: AutoTokenizer,
+) -> List[Dict[str, str]]:
+    """
+    Classifies input texts as gpt-4o or gpt-4o-mini.
+
+    Args:
+        texts: A list of input text strings to be classified.
+        model: The loaded model for sequence classification.
+        tokenizer: The loaded tokenizer.
+
+    Returns:
+        A list of dictionaries, where each dictionary contains the input text,
+        the predicted label, and the confidence score.
+    """
+    label_map = {0: "GPT-4o", 1: "GPT-4o mini"}
+    inputs = tokenizer(
+        texts,
+        padding="max_length",
+        truncation=True,
+        return_tensors="pt",
+    )
+    with torch.no_grad():
+        outputs = model(**inputs)
+    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+    confidence, predictions = torch.max(probabilities, dim=-1)
+
+    results = []
+    for text, pred, conf in zip(
+        texts,
+        predictions.tolist(),
+        confidence.tolist(),
+    ):
+        results.append(
+            {"input": text, "prediction": label_map[pred], "confidence": conf},
+        )
+
+    return results
+
+
+if __name__ == "__main__":
+    text = """The resignation brings a long political chapter to an end.
+    Trudeau has been in office since 2015, when he brought the Liberals back
+    to power from the political wilderness.
+    """
+
+    tokenizer, model = load_model_and_tokenizer("ductuan024/gpts-detector")
+    predictions = predict([text], model, tokenizer)  # predict() expects a list
+
+    print(predictions[0]["prediction"])
+    print(predictions[0]["confidence"])
src/application/text/helper.py CHANGED
@@ -15,6 +15,8 @@ from nltk.tokenize import (
 from nltk.util import ngrams
 from sklearn.feature_extraction.text import TfidfVectorizer
 
+from src.application.config import PREFIX
+
 
 def clean_text(text: str) -> str:
     """
@@ -122,9 +124,7 @@ def get_important_sentences(
         list: A list of important sentences.
     """
     # Clean and split the sentence into sentences
-    sentences = [
-        s.strip() for s in re.split(r"(?<=[.!?])\s+", sentence) if s.strip()
-    ]
+    sentences = [s for s in re.split(r"(?<=[.!?])\s+", sentence) if s]
 
     # Calculate the importance score for each sentence
     sentence_scores = []
@@ -293,13 +293,16 @@ def postprocess_label(labels: list[str]) -> str:
     Returns:
         A string with the formatted label.
     """
-    prefix = "Partially generated by "
+
     for index, label in enumerate(labels):
-        if label.startswith(prefix):
-            labels[index] = label[len(prefix) :]
+        # if label.startswith(PREFIX):
+        #     labels[index] = label[len(PREFIX) :]
+        if PREFIX in label:
+            labels[index] = label.replace(PREFIX, "")
 
     labels = list(set(labels))
-    label = prefix
+
+    label = ""
 
     if len(labels) == 1:
         label += labels[0]
@@ -362,7 +365,7 @@ def split_into_paragraphs(input_text: str) -> list[str]:
 
     for paragraph in paragraphs:
         # Remove leading/trailing whitespace
-        paragraph = paragraph.strip()
+        # paragraph = paragraph.strip()
 
         if paragraph and paragraph != "\n":
             # Append the cleaned paragraph to the output list.
@@ -460,6 +463,33 @@
     return filtered_starts, filtered_ends
 
 
+def replace_leading_spaces(text: str) -> str:
+    """
+    Replaces leading spaces in a string with '&nbsp;'.
+
+    Args:
+        text: The input string.
+
+    Returns:
+        The string with leading spaces replaced by '&nbsp;'.
+    """
+
+    if text is None:
+        return None
+
+    leading_spaces = 0
+    for char in text:
+        if char == " ":
+            leading_spaces += 1
+        else:
+            break
+
+    if leading_spaces > 0:
+        return "&nbsp;" * leading_spaces + text[leading_spaces:]
+    else:
+        return text
+
+
 def extract_new_startend(
     start: int,
     end: int,
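
Note: the point of routing prefix handling through PREFIX is that prefixed and bare labels now collapse to the same entry before the combined label is rebuilt. A minimal sketch with the config value inlined:

PREFIX = "Partially generated by "  # value defined in src/application/config.py

labels = ["Partially generated by GPT-4o", "GPT-4o"]
for index, label in enumerate(labels):
    if PREFIX in label:
        labels[index] = label.replace(PREFIX, "")
print(list(set(labels)))  # ['GPT-4o'] -- duplicates collapse after stripping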
src/application/text/model_detection.py CHANGED
@@ -15,8 +15,13 @@ from src.application.config import (
     HUMAN,
     MODEL_HUMAN_LABEL,
     PARAPHRASE_MODEL,
+    PREFIX,
     UNKNOWN,
 )
+from src.application.text.ai_classification import (
+    load_model_and_tokenizer,
+    predict,
+)
 
 
 def detect_text_by_ai_model(
@@ -63,7 +68,7 @@ def detect_text_by_ai_model(
     else:
         # label = MACHINE
         generated_model, _ = predict_generation_model(input_text)
-        label = f"Partially generated by {generated_model}"
+        label = f"{PREFIX}{generated_model}"
 
     return label, confidence_score
 
@@ -75,6 +80,24 @@ def detect_text_by_ai_model(
 def predict_generation_model(text: str) -> tuple[str, float]:
     """
     Predicts if text is generated by gpt-4o or gpt-4o-mini models.
+
+    Args:
+        text (str): The input text to be analyzed.
+
+    Returns:
+        tuple: (label, confidence_score)
+            where label is gpt-4o or gpt-4o-mini,
+            and confidence_score is the highest similarity.
+    """
+    tokenizer, model = load_model_and_tokenizer()
+    predictions = predict([text], model, tokenizer)  # predict() expects a list
+
+    return predictions[0]["prediction"], predictions[0]["confidence"]
+
+
+def predict_generation_model_by_reparaphrasing(text: str) -> tuple[str, float]:
+    """
+    Predicts if text is generated by gpt-4o or gpt-4o-mini models.
     Compares the input text against the paraphrased text by the models.
 
     Args:
@@ -82,8 +105,8 @@ def predict_generation_model(text: str) -> tuple[str, float]:
 
     Returns:
         tuple: (label, confidence_score)
-        where label is gpt-4o or gpt-4o-mini,
-        and confidence_score is the highest similarity.
+            where label is gpt-4o or gpt-4o-mini,
+            and confidence_score is the highest similarity.
     """
     best_similarity = 0
     best_model = GPT_PARAPHRASE_MODELS[0]
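
Note: predict_generation_model now delegates to the fine-tuned classifier instead of the re-paraphrasing comparison (kept above under a new name). End to end, the label is assembled like this (a sketch using the names defined in this commit; the model is downloaded on first use):

tokenizer, model = load_model_and_tokenizer()  # defaults to ductuan024/gpts-detector
predictions = predict(["some suspicious paragraph"], model, tokenizer)
label = f"{PREFIX}{predictions[0]['prediction']}"
print(label)  # e.g. "Partially generated by GPT-4o"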
src/application/text/search_detection.py CHANGED
@@ -3,8 +3,8 @@ Author: Khanh Phan
 Date: 2024-12-04
 """
 
-from typing import Optional
 import warnings
+from typing import Optional
 
 import numpy as np
 from pandas import DataFrame
@@ -14,6 +14,7 @@ from src.application.config import (
     DEVICE,
     MAX_CHAR_SIZE,
     PARAPHRASE_MODEL,
+    PARAPHRASE_THRESHOLD,
     PARAPHRASE_THRESHOLD_HUMAN,
     PARAPHRASE_THRESHOLD_MACHINE,
     TOP_URLS_PER_SEARCH,
@@ -96,15 +97,22 @@ def find_sentence_source(
         )
         return sentences_df, []
 
-    # assign values
-    columns = [
-        "input",
-        "source",
-        "label",
-        "similarity",
-        "paraphrase",
-        "url",
-    ]
+    if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD:
+        columns = [
+            "input",
+            "source",
+            "label",
+            "similarity",
+            "paraphrase",
+            "url",
+        ]
+    else:
+        columns = [
+            "input",
+            "label",
+            "paraphrase",
+        ]
+
     for c in columns:
         if c in sentences_df.columns:
             sentences_df.loc[text_index, c] = aligned_sentence[c]
@@ -126,13 +134,22 @@ def find_sentence_source(
                     similarity is None
                     or aligned_sentence["similarity"] > similarity
                 ):
-                    columns = [
-                        "input",
-                        "source",
-                        "label",
-                        "similarity",
-                        "url",
-                    ]
+                    if (
+                        aligned_sentence["similarity"]
+                        > PARAPHRASE_THRESHOLD
+                    ):
+                        columns = [
+                            "input",
+                            "source",
+                            "label",
+                            "similarity",
+                            "url",
+                        ]
+                    else:
+                        columns = [
+                            "input",
+                            "label",
+                        ]
                    for c in columns:
                         if c in sentences_df.columns:
                             sentences_df.loc[idx, c] = aligned_sentence[c]
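
Note: the effect of the new gate is that low-similarity matches keep their input/label/paraphrase values but no longer attach a source or URL. A toy illustration (the threshold value here is assumed; the real PARAPHRASE_THRESHOLD lives in config.py):

import pandas as pd

PARAPHRASE_THRESHOLD = 0.8  # assumed value for this sketch

sentences_df = pd.DataFrame(
    [{"input": None, "source": None, "label": None,
      "similarity": None, "paraphrase": None, "url": None}]
)
aligned_sentence = {"input": "s1", "source": "src", "label": "HUMAN",
                    "similarity": 0.5, "paraphrase": False, "url": "a.com"}

if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD:
    columns = ["input", "source", "label", "similarity", "paraphrase", "url"]
else:
    columns = ["input", "label", "paraphrase"]  # weak match: no source/url

for c in columns:
    sentences_df.loc[0, c] = aligned_sentence[c]
print(sentences_df[["input", "label", "url"]])  # url stays None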
src/application/text/text.py CHANGED
@@ -3,6 +3,8 @@ import pandas as pd
 
 class TextDetector:
     def __init__(self):
+        self.input = None
+
         self.prediction_label: list[str] = ["UNKNOWN"]
         self.prediction_score: list[float] = [0.0]
 
src/application/url_reader.py CHANGED
@@ -78,7 +78,7 @@ class URLReader:
 
         soup = BeautifulSoup(response.content, "html.parser")
 
-        self.title = soup.title.string.strip() if soup.title else None
+        self.title = soup.title.string if soup.title else None
 
         image_urls = [img["src"] for img in soup.find_all("img")]
         self.images = image_urls
test.py CHANGED
@@ -1,27 +1,39 @@
-def postprocess_label(labels: list[str]) -> str:
+def replace_leading_spaces(text):
     """
-    Creates a label string with the format
-    "Partially generated by [label1] and [label2] and ...".
-    Removes duplicate labels while preserving the original order.
+    Replaces leading spaces in a string with '&nbsp;'.
 
     Args:
-        labels: A list of strings representing labels.
+        text: The input string.
 
     Returns:
-        A string with the formatted label.
+        The string with leading spaces replaced by '&nbsp;'.
     """
-    labels = list(set(labels))
-    label = "Partially generated by "
-    if len(label) == 1:
-        label += labels[0]
-    elif len(labels) == 2:
-        label += f"{labels[0]} and {labels[1]}"
+
+    leading_spaces = 0
+    for char in text:
+        if char == " ":
+            leading_spaces += 1
+        else:
+            break
+
+    if leading_spaces > 0:
+        return "&nbsp;" * leading_spaces + text[leading_spaces:]
     else:
-        combination = ", ".join(labels[0 : len(labels) - 1])
-        label += f"{combination}, and {labels[-1]}"
-    return label
-
-
-labels = ["gpt-4o", "gpt-4o-mini", "gpt-4o-l"]
-postprocessed_label = postprocess_label(labels)
-print(postprocessed_label)
+        return text
+
+
+# Example usage:
+text1 = " Hello, world!"
+text2 = "No leading spaces."
+text3 = " Another example."
+text4 = "\t Test with tabs"  # this will not be replaced, only standard spaces
 
+result1 = replace_leading_spaces(text1)
+result2 = replace_leading_spaces(text2)
+result3 = replace_leading_spaces(text3)
+result4 = replace_leading_spaces(text4)
 
+print(f"'{text1}' becomes '{result1}'")
+print(f"'{text2}' becomes '{result2}'")
+print(f"'{text3}' becomes '{result3}'")
+print(f"'{text4}' becomes '{result4}'")