pmkhanh7890 commited on
Commit
62dc9d8
·
1 Parent(s): 66396a8

add AI tools; replace text baseline model; add Quoc's new algorithm

Browse files
application.py CHANGED
@@ -135,17 +135,27 @@ between the input text and the source.
135
  <thead>
136
  <tr>
137
  <th>Input news</th>
138
- <th>Source (corresponding URL provided in Originality)</th>
139
  <th>Forensic</th>
140
  <th>Originality</th>
141
  </tr>
142
  </thead>
143
  <tbody>
144
  <tr>
145
- <th>TBD</th>
146
- <th>TBD</th>
147
- <th>TBD</th>
148
- <th>TBD</th>
 
 
 
 
 
 
 
 
 
 
149
  </tr>
150
  </tbody>
151
  </table>
 
135
  <thead>
136
  <tr>
137
  <th>Input news</th>
138
+ <th>Source (URL in Originality)</th>
139
  <th>Forensic</th>
140
  <th>Originality</th>
141
  </tr>
142
  </thead>
143
  <tbody>
144
  <tr>
145
+ <td style="border-bottom: 1px solid transparent";>TBD</td>
146
+ <td style="border-bottom: 1px solid transparent";>TBD</td>
147
+ <td rowspan="2">TBD</td>
148
+ <td rowspan="2">TBD</td>
149
+ </tr>
150
+ <tr>
151
+ <td style="border-top: 1px solid transparent";>TBD</td>
152
+ <td style="border-top: 1px solid transparent";>TBD</td>
153
+ </tr>
154
+ <tr>
155
+ <td>TBD</td>
156
+ <td>TBD</td>
157
+ <td>TBD</td>
158
+ <td>TBD</td>
159
  </tr>
160
  </tbody>
161
  </table>
application_2.py CHANGED
@@ -1,254 +0,0 @@
1
- import gradio as gr
2
- import requests
3
- from PIL import Image
4
-
5
- from src.application.content_detection import NewsVerification
6
- from src.application.content_generation import (
7
- generate_fake_image,
8
- generate_fake_text,
9
- replace_text,
10
- )
11
- from src.application.url_reader import URLReader
12
-
13
- AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
14
- AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
15
-
16
-
17
- def load_url(url):
18
- """
19
- Load content from the given URL.
20
- """
21
- content = URLReader(url)
22
- image = None
23
- header = {
24
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
25
- }
26
- try:
27
- response = requests.get(
28
- url,
29
- headers=header,
30
- stream=True,
31
- )
32
- response.raise_for_status() # Raise an exception for bad status codes
33
-
34
- image_response = requests.get(content.top_image, stream=True)
35
- try:
36
- image = Image.open(image_response.raw)
37
- except OSError as e:
38
- print(f"Error loading image from {content.top_image}: {e}")
39
-
40
- except (requests.exceptions.RequestException, FileNotFoundError) as e:
41
- print(f"Error fetching image: {e}")
42
-
43
- return content.title, content.text, image
44
-
45
-
46
- def generate_analysis_report(
47
- news_title: str,
48
- news_content: str,
49
- news_image: Image,
50
- ):
51
- news_analysis = NewsVerification()
52
- news_analysis.load_news(news_title, news_content, news_image)
53
- news_analysis.generate_analysis_report()
54
- return news_analysis.analyze_details()
55
-
56
-
57
- # Define the GUI
58
- with gr.Blocks() as demo:
59
- gr.Markdown("# NEWS VERIFICATION")
60
-
61
- with gr.Row():
62
- # SETTINGS
63
- with gr.Column(scale=1):
64
- with gr.Accordion("1. Enter a URL"):
65
- url_input = gr.Textbox(
66
- label="",
67
- show_label=False,
68
- value="",
69
- )
70
- load_button = gr.Button("Load URL")
71
-
72
- with gr.Accordion(
73
- "2. Select content-generation models",
74
- open=True,
75
- visible=False,
76
- ):
77
- with gr.Row():
78
- text_generation_model = gr.Dropdown(
79
- choices=AZURE_TEXT_MODEL,
80
- label="Text-generation model",
81
- )
82
- image_generation_model = gr.Dropdown(
83
- choices=AZURE_IMAGE_MODEL,
84
- label="Image-generation model",
85
- )
86
- generate_text_button = gr.Button("Generate text")
87
- generate_image_button = gr.Button("Generate image")
88
-
89
- with gr.Accordion(
90
- "3. Replace any terms",
91
- open=True,
92
- visible=False,
93
- ):
94
- replace_df = gr.Dataframe(
95
- headers=["Find what:", "Replace with:"],
96
- datatype=["str", "str"],
97
- row_count=(1, "dynamic"),
98
- col_count=(2, "fixed"),
99
- interactive=True,
100
- )
101
- replace_button = gr.Button("Replace all")
102
-
103
- # GENERATED CONTENT
104
- with gr.Accordion("Input News"):
105
- news_title = gr.Textbox(label="Title", value="")
106
- news_image = gr.Image(label="Image", type="filepath")
107
- news_content = gr.Textbox(label="Content", value="", lines=13)
108
-
109
- # NEWS ANALYSIS REPORT
110
- ordinary_user_explanation = """
111
- FOR ORDINARY USER<br>
112
- - Green texts are the matched words in the input and source news.<br>
113
- - Each highlighted pair (marked with a number) shows the key differences
114
- between the input text and the source.
115
- """
116
- fact_checker_explanation = """
117
- FOR FACT CHECKER<br>
118
- - Green texts are the matched words in the input and source news.<br>
119
- - Each highlighted pair (marked with a number) shows the key differences
120
- between the input text and the source.
121
- """
122
- governor_explanation = """
123
- FOR GOVERNOR<br>
124
- - Green texts are the matched words in the input and source news.<br>
125
- - Each highlighted pair (marked with a number) shows the key differences
126
- between the input text and the source.
127
- """
128
- table = """
129
- <h5>Comparison between input news and source news:</h5>
130
- <table border="1" style="width:100%; text-align:left;">
131
- <col style="width: 170px;">
132
- <col style="width: 170px;">
133
- <col style="width: 30px;">
134
- <col style="width: 75px;">
135
- <thead>
136
- <tr>
137
- <th>Input news</th>
138
- <th>Source (corresponding URL provided in Originality)</th>
139
- <th>Forensic</th>
140
- <th>Originality</th>
141
- </tr>
142
- </thead>
143
- <tbody>
144
- <tr>
145
- <th>TBD</th>
146
- <th>TBD</th>
147
- <th>TBD</th>
148
- <th>TBD</th>
149
- </tr>
150
- </tbody>
151
- </table>
152
-
153
- <style>"""
154
- with gr.Column(scale=2):
155
- with gr.Accordion("NEWS ANALYSIS"):
156
- verification_button = gr.Button("Verify news")
157
- with gr.Tab("Orinary User"):
158
- gr.HTML(ordinary_user_explanation)
159
- ordinary_user_result = gr.HTML(table)
160
- with gr.Tab("Fact Checker"):
161
- gr.HTML(fact_checker_explanation)
162
- fact_checker_result = gr.HTML(table)
163
- with gr.Tab("Governor"):
164
- gr.HTML(governor_explanation)
165
- governor_result = gr.HTML(table)
166
-
167
- # Connect events
168
- load_button.click(
169
- load_url,
170
- inputs=url_input,
171
- outputs=[news_title, news_content, news_image],
172
- )
173
- replace_button.click(
174
- replace_text,
175
- inputs=[news_title, news_content, replace_df],
176
- outputs=[news_title, news_content],
177
- )
178
- generate_text_button.click(
179
- generate_fake_text,
180
- inputs=[text_generation_model, news_title, news_content],
181
- outputs=[news_title, news_content],
182
- )
183
- generate_image_button.click(
184
- generate_fake_image,
185
- inputs=[image_generation_model, news_title],
186
- outputs=[news_image],
187
- )
188
- verification_button.click(
189
- generate_analysis_report,
190
- inputs=[news_title, news_content, news_image],
191
- outputs=[ordinary_user_result, fact_checker_result, governor_result],
192
- )
193
-
194
- # change Image
195
- # url_input.change(load_image, inputs=url_input, outputs=image_view)
196
-
197
- try:
198
- with open(
199
- "examples/example_text_real.txt",
200
- encoding="utf-8",
201
- ) as file:
202
- text_real_1 = file.read()
203
- with open(
204
- "examples/example_text_real_2.txt",
205
- encoding="utf-8",
206
- ) as file:
207
- text_real_2 = file.read()
208
- with open(
209
- "examples/example_text_LLM_topic.txt",
210
- encoding="utf-8",
211
- ) as file:
212
- text_llm_topic = file.read()
213
- with open(
214
- "examples/example_text_LLM_modification.txt",
215
- encoding="utf-8",
216
- ) as file:
217
- text_llm_modification = file.read()
218
- with open(
219
- "examples/example_text_LLM_entities.txt",
220
- encoding="utf-8",
221
- ) as file:
222
- text_llm_entities = file.read()
223
- except FileNotFoundError:
224
- print("File not found.")
225
- except Exception as e:
226
- print(f"An error occurred: {e}")
227
-
228
- title_1 = "Southampton news: Leeds target striker Cameron Archer."
229
- title_2 = "Southampton news: Leeds target striker Cameron Archer."
230
- title_4 = "Japan pledges support for Ukraine with 100-year pact."
231
-
232
- image_1 = "examples/example_image_real_1.jpg.webp"
233
- image_2 = "examples/example_image_real_2.jpg.webp"
234
- image_3 = "examples/example_image_real_3.jpg"
235
- image_4 = "examples/example_image_real_4.jpg.webp"
236
-
237
- gr.Examples(
238
- examples=[
239
- [title_1, image_1, text_real_1 + "\n\n" + text_real_2],
240
- [title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
241
- [title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
242
- [title_4, image_4, text_llm_entities],
243
- ],
244
- inputs=[news_title, news_image, news_content],
245
- label="Examples",
246
- example_labels=[
247
- "2 real news",
248
- "1 real news + 1 LLM modification-based news",
249
- "1 real news + 1 LLM topic-based news",
250
- "1 LLM changed-entities news",
251
- ],
252
- )
253
-
254
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
application_3.py DELETED
@@ -1,254 +0,0 @@
1
- import gradio as gr
2
- import requests
3
- from PIL import Image
4
-
5
- from src.application.content_detection import NewsVerification
6
- from src.application.content_generation import (
7
- generate_fake_image,
8
- generate_fake_text,
9
- replace_text,
10
- )
11
- from src.application.url_reader import URLReader
12
-
13
- AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
14
- AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
15
-
16
-
17
- def load_url(url):
18
- """
19
- Load content from the given URL.
20
- """
21
- content = URLReader(url)
22
- image = None
23
- header = {
24
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
25
- }
26
- try:
27
- response = requests.get(
28
- url,
29
- headers=header,
30
- stream=True,
31
- )
32
- response.raise_for_status() # Raise an exception for bad status codes
33
-
34
- image_response = requests.get(content.top_image, stream=True)
35
- try:
36
- image = Image.open(image_response.raw)
37
- except OSError as e:
38
- print(f"Error loading image from {content.top_image}: {e}")
39
-
40
- except (requests.exceptions.RequestException, FileNotFoundError) as e:
41
- print(f"Error fetching image: {e}")
42
-
43
- return content.title, content.text, image
44
-
45
-
46
- def generate_analysis_report(
47
- news_title: str,
48
- news_content: str,
49
- news_image: Image,
50
- ):
51
- news_analysis = NewsVerification()
52
- news_analysis.load_news(news_title, news_content, news_image)
53
- news_analysis.generate_analysis_report()
54
- return news_analysis.analyze_details()
55
-
56
-
57
- # Define the GUI
58
- with gr.Blocks() as demo:
59
- gr.Markdown("# NEWS VERIFICATION")
60
-
61
- with gr.Row():
62
- # SETTINGS
63
- with gr.Column(scale=1):
64
- with gr.Accordion("1. Enter a URL"):
65
- url_input = gr.Textbox(
66
- label="",
67
- show_label=False,
68
- value="",
69
- )
70
- load_button = gr.Button("Load URL")
71
-
72
- with gr.Accordion(
73
- "2. Select content-generation models",
74
- open=True,
75
- visible=False,
76
- ):
77
- with gr.Row():
78
- text_generation_model = gr.Dropdown(
79
- choices=AZURE_TEXT_MODEL,
80
- label="Text-generation model",
81
- )
82
- image_generation_model = gr.Dropdown(
83
- choices=AZURE_IMAGE_MODEL,
84
- label="Image-generation model",
85
- )
86
- generate_text_button = gr.Button("Generate text")
87
- generate_image_button = gr.Button("Generate image")
88
-
89
- with gr.Accordion(
90
- "3. Replace any terms",
91
- open=True,
92
- visible=False,
93
- ):
94
- replace_df = gr.Dataframe(
95
- headers=["Find what:", "Replace with:"],
96
- datatype=["str", "str"],
97
- row_count=(1, "dynamic"),
98
- col_count=(2, "fixed"),
99
- interactive=True,
100
- )
101
- replace_button = gr.Button("Replace all")
102
-
103
- # GENERATED CONTENT
104
- with gr.Accordion("Input News"):
105
- news_title = gr.Textbox(label="Title", value="")
106
- news_image = gr.Image(label="Image", type="filepath")
107
- news_content = gr.Textbox(label="Content", value="", lines=13)
108
-
109
- # NEWS ANALYSIS REPORT
110
- ordinary_user_explanation = """
111
- FOR ORDINARY USER<br>
112
- - Green texts are the matched words in the input and source news.<br>
113
- - Each highlighted pair (marked with a number) shows the key differences
114
- between the input text and the source.
115
- """
116
- fact_checker_explanation = """
117
- FOR FACT CHECKER<br>
118
- - Green texts are the matched words in the input and source news.<br>
119
- - Each highlighted pair (marked with a number) shows the key differences
120
- between the input text and the source.
121
- """
122
- governor_explanation = """
123
- FOR GOVERNOR<br>
124
- - Green texts are the matched words in the input and source news.<br>
125
- - Each highlighted pair (marked with a number) shows the key differences
126
- between the input text and the source.
127
- """
128
- table = """
129
- <h5>Comparison between input news and source news:</h5>
130
- <table border="1" style="width:100%; text-align:left;">
131
- <col style="width: 170px;">
132
- <col style="width: 170px;">
133
- <col style="width: 30px;">
134
- <col style="width: 75px;">
135
- <thead>
136
- <tr>
137
- <th>Input news</th>
138
- <th>Source (corresponding URL provided in Originality)</th>
139
- <th>Forensic</th>
140
- <th>Originality</th>
141
- </tr>
142
- </thead>
143
- <tbody>
144
- <tr>
145
- <th>TBD</th>
146
- <th>TBD</th>
147
- <th>TBD</th>
148
- <th>TBD</th>
149
- </tr>
150
- </tbody>
151
- </table>
152
-
153
- <style>"""
154
- with gr.Column(scale=2):
155
- with gr.Accordion("NEWS ANALYSIS"):
156
- verification_button = gr.Button("Verify news")
157
- with gr.Tab("Orinary User"):
158
- gr.HTML(ordinary_user_explanation)
159
- ordinary_user_result = gr.HTML(table)
160
- with gr.Tab("Fact Checker"):
161
- gr.HTML(fact_checker_explanation)
162
- fact_checker_result = gr.HTML(table)
163
- with gr.Tab("Governor"):
164
- gr.HTML(governor_explanation)
165
- governor_result = gr.HTML(table)
166
-
167
- # Connect events
168
- load_button.click(
169
- load_url,
170
- inputs=url_input,
171
- outputs=[news_title, news_content, news_image],
172
- )
173
- replace_button.click(
174
- replace_text,
175
- inputs=[news_title, news_content, replace_df],
176
- outputs=[news_title, news_content],
177
- )
178
- generate_text_button.click(
179
- generate_fake_text,
180
- inputs=[text_generation_model, news_title, news_content],
181
- outputs=[news_title, news_content],
182
- )
183
- generate_image_button.click(
184
- generate_fake_image,
185
- inputs=[image_generation_model, news_title],
186
- outputs=[news_image],
187
- )
188
- verification_button.click(
189
- generate_analysis_report,
190
- inputs=[news_title, news_content, news_image],
191
- outputs=[ordinary_user_result, fact_checker_result, governor_result],
192
- )
193
-
194
- # change Image
195
- # url_input.change(load_image, inputs=url_input, outputs=image_view)
196
-
197
- try:
198
- with open(
199
- "examples/example_text_real.txt",
200
- encoding="utf-8",
201
- ) as file:
202
- text_real_1 = file.read()
203
- with open(
204
- "examples/example_text_real_2.txt",
205
- encoding="utf-8",
206
- ) as file:
207
- text_real_2 = file.read()
208
- with open(
209
- "examples/example_text_LLM_topic.txt",
210
- encoding="utf-8",
211
- ) as file:
212
- text_llm_topic = file.read()
213
- with open(
214
- "examples/example_text_LLM_modification.txt",
215
- encoding="utf-8",
216
- ) as file:
217
- text_llm_modification = file.read()
218
- with open(
219
- "examples/example_text_LLM_entities.txt",
220
- encoding="utf-8",
221
- ) as file:
222
- text_llm_entities = file.read()
223
- except FileNotFoundError:
224
- print("File not found.")
225
- except Exception as e:
226
- print(f"An error occurred: {e}")
227
-
228
- title_1 = "Southampton news: Leeds target striker Cameron Archer."
229
- title_2 = "Southampton news: Leeds target striker Cameron Archer."
230
- title_4 = "Japan pledges support for Ukraine with 100-year pact."
231
-
232
- image_1 = "examples/example_image_real_1.jpg.webp"
233
- image_2 = "examples/example_image_real_2.jpg.webp"
234
- image_3 = "examples/example_image_real_3.jpg"
235
- image_4 = "examples/example_image_real_4.jpg.webp"
236
-
237
- gr.Examples(
238
- examples=[
239
- [title_1, image_1, text_real_1 + "\n\n" + text_real_2],
240
- [title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
241
- [title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
242
- [title_4, image_4, text_llm_entities],
243
- ],
244
- inputs=[news_title, news_image, news_content],
245
- label="Examples",
246
- example_labels=[
247
- "2 real news",
248
- "1 real news + 1 LLM modification-based news",
249
- "1 real news + 1 LLM topic-based news",
250
- "1 LLM changed-entities news",
251
- ],
252
- )
253
-
254
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gpt_test.py CHANGED
@@ -76,12 +76,12 @@ azure_client = AzureOpenAI(
76
  api_version="2024-05-01-preview",
77
  )
78
 
79
- deplopment_name = "gpt-4o" # "o1-mini" # or "gpt-4o"
80
  TEXT_PROMPT = """
81
  Paraphrase the following news, only output the paraphrased text:
82
 
83
  """
84
- text = get_first_column("data/bbc_news.csv")
85
  count = 0
86
  for index, news in enumerate(text):
87
  if count > 1000:
@@ -107,4 +107,4 @@ for index, news in enumerate(text):
107
  count += 1
108
  paraphrased_news = response.choices[0].message.content
109
 
110
- add_text_to_csv("data/bbc_news_4o.csv", paraphrased_news, count)
 
76
  api_version="2024-05-01-preview",
77
  )
78
 
79
+ deplopment_name = "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
80
  TEXT_PROMPT = """
81
  Paraphrase the following news, only output the paraphrased text:
82
 
83
  """
84
+ text = get_first_column("data/MAGE.csv")
85
  count = 0
86
  for index, news in enumerate(text):
87
  if count > 1000:
 
107
  count += 1
108
  paraphrased_news = response.choices[0].message.content
109
 
110
+ add_text_to_csv("data/MAGE_4o_mini.csv", paraphrased_news, count)
src/application/content_detection.py CHANGED
@@ -13,11 +13,10 @@ from src.application.text.entity import (
13
  highlight_entities,
14
  )
15
  from src.application.text.helper import extract_equal_text
16
- from src.application.text.model_detection import detect_text_by_ai_model
17
  from src.application.text.preprocessing import split_into_paragraphs
18
  from src.application.text.search_detection import (
19
- check_human,
20
- detect_text_by_relative_search,
21
  find_paragraph_source,
22
  )
23
 
@@ -29,18 +28,21 @@ class NewsVerification:
29
  self.news_content = ""
30
  self.news_image = ""
31
 
32
- self.text_prediction_label: list[str] = []
33
- self.text_prediction_score: list[float] = []
34
- self.text_referent_url: list[str] = []
35
- self.image_prediction_label: list[str] = []
36
- self.image_prediction_score: list[str] = []
37
  self.image_referent_url: list[str] = []
 
38
  self.news_prediction_label = ""
39
  self.news_prediction_score = -1
40
 
 
41
  self.found_img_url: list[str] = []
42
- self.aligned_sentences: list[dict] = []
43
- self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
 
44
  columns=[
45
  "input",
46
  "source",
@@ -52,32 +54,78 @@ class NewsVerification:
52
  "entities",
53
  ],
54
  )
55
- self.is_paraphrased: list[bool] = []
56
 
 
57
  self.ordinary_user_table: list = []
58
  self.fact_checker_table: list = []
59
  self.governor_table: list = []
60
- self.entities_with_colors = []
61
 
62
  def load_news(self, news_title, news_content, news_image):
63
- self.news_text = news_title + "\n\n" + news_content
64
  self.news_title = news_title
65
  self.news_content = news_content
66
  self.news_image = news_image
67
 
68
  def determine_text_origin(self):
69
  self.find_text_source()
70
- label, score = self.verify_text()
71
- if label == "UNKNOWN":
72
- # Concatenate text from "input" in sentence_df
73
- print(self.aligned_sentences_df["input"])
74
- text = " ".join(self.aligned_sentences_df["input"].tolist())
75
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  # detect by baseline model
77
- label, score = detect_text_by_ai_model(text)
78
-
79
- return label, score
80
-
81
  def find_text_source(self):
82
  """
83
  Determines the origin of the given text based on paraphrasing detection
@@ -99,8 +147,8 @@ class NewsVerification:
99
  # Setup df for input_sentences
100
 
101
  for _ in range(len(input_sentences)):
102
- self.aligned_sentences_df = pd.concat(
103
- [self.aligned_sentences_df, pd.DataFrame([{
104
  "input": None,
105
  "source": None,
106
  "label": None,
@@ -113,36 +161,63 @@ class NewsVerification:
113
  )
114
 
115
  # find a source for each paragraph
116
- for index, sentence in enumerate(input_sentences):
117
- if self.aligned_sentences_df.loc[index, "url"] is not None:
118
- continue
 
 
119
 
120
- print(f"-------index = {index}-------")
121
- print(f"current_sentence = {input_sentences[index]}")
122
 
123
- self.aligned_sentences_df, img_urls = find_paragraph_source(
124
  input_sentences,
125
  index,
126
- self.aligned_sentences_df,
127
  )
128
 
129
  self.found_img_url.extend(img_urls)
130
 
131
  # determine if the whole source is from a news or not
132
 
133
- def verify_text(self):
 
 
134
  # calculate the average similarity when the similary score in each row of sentences_df is higher than 0.8
135
- filtered_by_similarity = self.aligned_sentences_df[
136
- self.aligned_sentences_df["similarity"] > 0.8
137
  ]
138
- if len(filtered_by_similarity) / len(self.aligned_sentences_df) > 2:
139
- avg_similarity = filtered_by_similarity.similarity.mean()
140
- if avg_similarity > 0.963:
141
- return "HUMAN", avg_similarity
142
- if avg_similarity > 0.8:
143
- return "MACHINE", avg_similarity
144
-
145
- return "UNKNOWN", 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  def determine_image_origin(self):
148
  print("CHECK IMAGE:")
@@ -152,14 +227,12 @@ class NewsVerification:
152
  self.image_referent_url = None
153
  return
154
 
155
- for image in self.found_img_url:
156
- print(f"\tfound_img_url: {image}")
157
  matched_url, similarity = detect_image_from_news_image(
158
  self.news_image,
159
  self.found_img_url,
160
  )
161
  if matched_url is not None:
162
- print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
163
  self.image_prediction_label = "HUMAN"
164
  self.image_prediction_score = similarity
165
  self.image_referent_url = matched_url
@@ -169,7 +242,7 @@ class NewsVerification:
169
  self.news_image,
170
  )
171
  if matched_url is not None:
172
- print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
173
  self.image_prediction_label = "HUMAN"
174
  self.image_prediction_score = similarity
175
  self.image_referent_url = matched_url
@@ -187,50 +260,35 @@ class NewsVerification:
187
  self.image_prediction_score = 50
188
  self.image_referent_url = None
189
 
190
- def determine_news_origin(self):
191
- if self.text_prediction_label == "MACHINE":
192
- text_prediction_score = 100 - self.text_prediction_score
193
- elif self.text_prediction_label == "UNKNOWN":
194
- text_prediction_score = 50
195
- else:
196
- text_prediction_score = self.text_prediction_score
197
-
198
- if self.image_prediction_label == "MACHINE":
199
- image_prediction_score = 100 - self.image_prediction_score
200
- elif self.image_prediction_label == "UNKNOWN":
201
- image_prediction_score = 50
202
- else:
203
- image_prediction_score = self.image_prediction_score
204
-
205
- news_prediction_score = (
206
- text_prediction_score + image_prediction_score
207
- ) / 2
208
- if news_prediction_score > 50:
209
- self.news_prediction_score = news_prediction_score
210
- self.news_prediction_label = "HUMAN"
211
- else:
212
- self.news_prediction_score = 100 - news_prediction_score
213
- self.news_prediction_label = "MACHINE"
214
-
215
  def generate_analysis_report(self):
216
- self.determine_text_origin()
217
- self.determine_image_origin()
 
 
218
 
219
  def analyze_details(self):
 
 
 
 
 
 
 
 
220
  entities_with_colors = []
221
- for index, aligned_sentence in enumerate(self.aligned_sentences):
222
  # Get entity-words (in pair) with colors
223
  entities_with_colors = highlight_entities(
224
- aligned_sentence["input"],
225
- aligned_sentence["source"],
226
  )
227
- self.aligned_sentences[index]["entities"] = entities_with_colors
228
 
229
- ordinary_user_table = self.create_ordinary_user_table()
230
- fact_checker_table = self.create_fact_checker_table()
231
- governor_table = self.create_governor_table()
 
 
232
 
233
- return ordinary_user_table, fact_checker_table, governor_table
234
 
235
  def get_text_urls(self):
236
  return set(self.text_referent_url)
@@ -277,33 +335,52 @@ class NewsVerification:
277
  max_length = 30 # TODO: put this in configuration
278
  rows.append(self.format_image_fact_checker_row(max_length))
279
 
280
- for aligned_sentence in self.aligned_sentences:
281
- if "input" not in aligned_sentence:
282
  continue
283
-
284
- # Get index of equal phrases in input and source sentences
285
- equal_idx_1, equal_idx_2 = extract_equal_text(
286
- aligned_sentence["input"],
287
- aligned_sentence["source"],
288
- )
289
-
290
- # Get entity-words (in pair) with colors
291
- # entities_with_colors = highlight_entities(
292
- # aligned_sentence["input"],
293
- # aligned_sentence["source"],
294
- # )
295
 
296
  self.fact_checker_table.append(
297
  [
298
- aligned_sentence,
299
  equal_idx_1,
300
  equal_idx_2,
301
- aligned_sentence["entities"],
 
302
  ],
303
  )
304
-
305
- for row in self.fact_checker_table:
306
- formatted_row = self.format_text_fact_checker_row(row, max_length)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  rows.append(formatted_row)
308
 
309
  table = "\n".join(rows)
@@ -317,7 +394,7 @@ class NewsVerification:
317
  <thead>
318
  <tr>
319
  <th>Input news</th>
320
- <th>Source (corresponding URL provided in Originality)</th>
321
  <th>Forensic</th>
322
  <th>Originality</th>
323
  </tr>
@@ -330,23 +407,38 @@ class NewsVerification:
330
  <style>
331
  """
332
 
333
- def format_text_fact_checker_row(self, row, max_length=30):
 
 
 
 
 
 
 
334
  entity_count = 0
335
- if row[0]["input"] == "":
336
  return ""
337
- if row[0]["source"] != "": # source is not empty
338
- # highlight entities
339
- input_sentence, highlight_idx_input = apply_highlight(
340
- row[0]["input"],
341
- row[3],
342
- "input",
343
- )
344
- source_sentence, highlight_idx_source = apply_highlight(
345
- row[0]["source"],
346
- row[3],
347
- "source",
348
- )
349
- entity_count = len(row[3])
 
 
 
 
 
 
 
 
350
 
351
  # Color overlapping words
352
  input_sentence = self.color_text(
@@ -360,6 +452,8 @@ class NewsVerification:
360
  highlight_idx_source,
361
  ) # text, index of highlight words
362
 
 
 
363
  input_sentence = input_sentence.replace(
364
  "span_style",
365
  "span style",
@@ -372,23 +466,69 @@ class NewsVerification:
372
  input_sentence = row[0]["input"]
373
  source_sentence = row[0]["source"]
374
 
375
- label = row[0]["label"]
376
- score = row[0]["similarity"]
377
-
378
- url = row[0]["url"] #
 
 
 
 
 
 
 
 
 
 
379
  short_url = self.shorten_url(url, max_length)
380
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
381
 
 
382
  entity_count_text = self.get_entity_count_text(entity_count)
383
-
384
- return f"""
385
- <tr>
386
- <td>{input_sentence}</td>
387
- <td>{source_sentence}</td>
388
- <td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td> # noqa: E501
389
- <td>{source_text_url}</td>
390
- </tr>
391
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
 
393
  def format_image_fact_checker_row(self, max_length=30):
394
 
@@ -396,7 +536,7 @@ class NewsVerification:
396
  self.image_referent_url is not None
397
  or self.image_referent_url != ""
398
  ):
399
- source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">""" # noqa: E501
400
  short_url = self.shorten_url(self.image_referent_url, max_length)
401
  source_image_url = (
402
  f"""<a href="{self.image_referent_url}">{short_url}</a>"""
@@ -418,7 +558,6 @@ class NewsVerification:
418
  <h5>Comparison between input news and source news:</h5>
419
  <table border="1" style="width:100%; text-align:left;">
420
  <col style="width: 170px;">
421
- <col style="width: 170px;">
422
  <col style="width: 30px;">
423
  <col style="width: 75px;">
424
  <thead>
@@ -439,26 +578,22 @@ class NewsVerification:
439
  def format_text_ordinary_user_row(self, max_length=30):
440
  input_sentences = ""
441
  source_text_urls = ""
442
- label = ""
443
- scores = 0
444
- sentence_count = 0
445
- for index, row in enumerate(self.aligned_sentences):
446
- if row["input"] == "":
447
  continue
448
  input_sentences += row["input"] + "<br><br>"
449
- label = self.aligned_sentences[index]["label"]
450
-
451
- url = self.aligned_sentences[index]["url"] #
452
- short_url = self.shorten_url(url, max_length)
453
- source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
454
- sentence_count += 1
455
-
456
- scores, label = self.calculate_score_label()
457
 
458
  return f"""
459
  <tr>
460
  <td>{input_sentences}</td>
461
- <td>{label}<br>({scores * 100:.2f}%)</td>
 
462
  <td>{source_text_urls}</td>
463
  </tr>
464
  """
@@ -484,28 +619,26 @@ class NewsVerification:
484
  max_length = 30 # TODO: put this in configuration
485
  rows.append(self.format_image_governor_row(max_length))
486
 
487
- for aligned_sentence in self.aligned_sentences:
488
- if "input" not in aligned_sentence:
489
  continue
490
-
491
- # Get index of equal phrases in input and source sentences
492
- equal_idx_1, equal_idx_2 = extract_equal_text(
493
- aligned_sentence["input"],
494
- aligned_sentence["source"],
495
- )
496
-
497
- # Get entity-words (in pair) with colors
498
- # entities_with_colors = highlight_entities(
499
- # aligned_sentence["input"],
500
- # aligned_sentence["source"],
501
- # )
502
 
503
  self.governor_table.append(
504
  [
505
- aligned_sentence,
506
  equal_idx_1,
507
  equal_idx_2,
508
- aligned_sentence["entities"],
509
  ],
510
  )
511
 
@@ -523,7 +656,7 @@ class NewsVerification:
523
  <thead>
524
  <tr>
525
  <th>Input news</th>
526
- <th>Source (corresponding URL provided in Originality)</th>
527
  <th>Forensic</th>
528
  <th>Originality</th>
529
  </tr>
@@ -540,29 +673,27 @@ class NewsVerification:
540
  input_sentences = ""
541
  source_sentences = ""
542
  source_text_urls = ""
543
- label = ""
544
  sentence_count = 0
545
- entity_count = 0
546
  for row in self.governor_table:
547
- print(f"governor_row: {row}")
548
- if row[0]["input"] == "":
549
  continue
550
 
551
- if row[0]["source"] != "": # source is not empty
552
  # highlight entities
553
  input_sentence, highlight_idx_input = apply_highlight(
554
  row[0]["input"],
555
- row[3],
556
- "input",
557
- entity_count,
558
  )
559
  source_sentence, highlight_idx_source = apply_highlight(
560
  row[0]["source"],
561
- row[3],
562
- "source",
563
- entity_count,
564
  )
565
- entity_count += len(row[3])
566
 
567
  # Color overlapping words
568
  input_sentence = self.color_text(
@@ -586,26 +717,35 @@ class NewsVerification:
586
  ).replace("1px_4px", "1px 4px")
587
 
588
  else:
 
 
 
 
589
  input_sentence = row[0]["input"]
590
- source_sentence = row[0]["source"]
591
 
592
  # convert score to HUMAN-based score:
593
  input_sentences += input_sentence + "<br><br>"
594
  source_sentences += source_sentence + "<br><br>"
595
-
596
  url = row[0]["url"]
597
- short_url = self.shorten_url(url, max_length)
598
- source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
599
- sentence_count += 1
600
-
601
- score, label = self.calculate_score_label()
602
- entity_count_text = self.get_entity_count_text(entity_count)
 
 
 
603
 
604
  return f"""
605
  <tr>
606
  <td>{input_sentences}</td>
607
  <td>{source_sentences}</td>
608
- <td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td>
 
 
609
  <td>{source_text_urls}</td>
610
  </tr>
611
  """
@@ -615,7 +755,7 @@ class NewsVerification:
615
  self.image_referent_url is not None
616
  or self.image_referent_url != ""
617
  ):
618
- source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">""" # noqa: E501
619
  short_url = self.shorten_url(self.image_referent_url, max_length)
620
  source_image_url = (
621
  f"""<a href="{self.image_referent_url}">{short_url}</a>"""
@@ -630,7 +770,7 @@ class NewsVerification:
630
  if entity_count <= 0:
631
  entity_count_text = ""
632
  elif entity_count == 1:
633
- entity_count_text = "with altered entity"
634
  else:
635
  entity_count_text = "with altered entities"
636
  return entity_count_text
@@ -651,7 +791,7 @@ class NewsVerification:
651
 
652
  starts, ends = self.extract_starts_ends(colored_idx)
653
  starts, ends = self.filter_indices(starts, ends, highlighted_idx)
654
-
655
  previous_end = 0
656
  for start, end in zip(starts, ends):
657
  paragraph += " ".join(words[previous_end:start])
@@ -661,13 +801,7 @@ class NewsVerification:
661
 
662
  previous_end = end
663
 
664
- # Some left words due to the punctuation separated from
665
- # the highlighting text
666
- equal_words = " ".join(words[previous_end:])
667
- print(f"starts_2: {previous_end}")
668
- print(f"ends_2: {len(words) - 1}")
669
- print(f"equal_words: {words[previous_end:]}")
670
- paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
671
 
672
  return paragraph
673
 
@@ -750,38 +884,12 @@ class NewsVerification:
750
  end = number
751
  else:
752
  starts.append(start)
753
- ends.append(end + 1)
754
  start = number
755
  end = number
756
 
757
  if i == len(numbers) - 1:
758
  starts.append(start)
759
- ends.append(end + 1)
760
-
761
- return starts, ends
762
 
763
- def calculate_score_label(self):
764
- human_score = []
765
- machine_score = []
766
- machine_flag = False
767
- for sentence in self.aligned_sentences:
768
- if sentence["input"] == "":
769
- continue
770
- if sentence["label"] == "HUMAN":
771
- human_score.append(sentence["similarity"])
772
- elif sentence["label"] == "MACHINE":
773
- machine_score.append(1 - sentence["similarity"])
774
- machine_flag = True
775
-
776
- if machine_flag is True and len(machine_score) > 0:
777
- # average value of machine_score
778
- machine_score_avg = sum(machine_score) / len(machine_score)
779
- if machine_score_avg < 0.5:
780
- machine_score_avg = 1 - machine_score_avg
781
- return machine_score_avg, "MACHINE"
782
- elif machine_flag is False and len(human_score) > 0:
783
- # average value of human_score
784
- human_score_avg = sum(human_score) / len(human_score)
785
- return human_score_avg, "HUMAN"
786
- else:
787
- return 0, "UNKNOWN"
 
13
  highlight_entities,
14
  )
15
  from src.application.text.helper import extract_equal_text
16
+ from src.application.text.model_detection import detect_text_by_ai_model, predict_generation_model
17
  from src.application.text.preprocessing import split_into_paragraphs
18
  from src.application.text.search_detection import (
19
+ PARAPHRASE_THRESHOLD_MACHINE,
 
20
  find_paragraph_source,
21
  )
22
 
 
28
  self.news_content = ""
29
  self.news_image = ""
30
 
31
+ self.text_prediction_label: list[str] = ["UNKNOWN"]
32
+ self.text_prediction_score: list[float] = [0.0]
33
+
34
+ self.image_prediction_label: list[str] = ["UNKNOWN"]
35
+ self.image_prediction_score: list[str] = [0.0]
36
  self.image_referent_url: list[str] = []
37
+
38
  self.news_prediction_label = ""
39
  self.news_prediction_score = -1
40
 
41
+ # news' urls to find img
42
  self.found_img_url: list[str] = []
43
+
44
+ # Analyzed results
45
+ self.aligned_paragraphs_df: pd.DataFrame = pd.DataFrame(
46
  columns=[
47
  "input",
48
  "source",
 
54
  "entities",
55
  ],
56
  )
57
+ self.grouped_url_df: pd.DataFrame = pd.DataFrame()
58
 
59
+ # For formatting ouput tables
60
  self.ordinary_user_table: list = []
61
  self.fact_checker_table: list = []
62
  self.governor_table: list = []
 
63
 
64
  def load_news(self, news_title, news_content, news_image):
65
+ self.news_text = (news_title + "\n\n" + news_content).strip()
66
  self.news_title = news_title
67
  self.news_content = news_content
68
  self.news_image = news_image
69
 
70
  def determine_text_origin(self):
71
  self.find_text_source()
72
+
73
+ # Group inout and source by url
74
+ def concat_text(series):
75
+ return ' '.join(series.astype(str).tolist()) #Handle mixed data types and NaNs
76
+
77
+ self.grouped_url_df = self.aligned_paragraphs_df.groupby('url').agg(
78
+ {
79
+ 'input': concat_text,
80
+ 'source': concat_text,
81
+ }
82
+ )
83
+ self.grouped_url_df = self.grouped_url_df.reset_index()
84
+ # Add new columns for label and score
85
+ self.grouped_url_df["label"] = None
86
+ self.grouped_url_df["score"] = None
87
+
88
+ print(f"aligned_paragraphs_df:\n {self.aligned_paragraphs_df}")
89
+
90
+ for index, row in self.grouped_url_df.iterrows():
91
+ label, score = self.verify_text(row["url"])
92
+ if label == "UNKNOWN":
93
+ # Concatenate text from "input" in sentence_df
94
+ text = " ".join(row["input"])
95
+
96
+ # detect by baseline model
97
+ label, score = detect_text_by_ai_model(text)
98
+
99
+ self.grouped_url_df.at[index, "label"] = label
100
+ self.grouped_url_df.at[index, "score"] = score
101
+
102
+ # Overall label or score for the whole input text
103
+ if len(self.grouped_url_df) > 0:
104
+ # filter self.aligned_paragraphs_df["label"] if inclucind substring MACHINE
105
+ machine_label = self.grouped_url_df[
106
+ self.grouped_url_df["label"].str.contains("MACHINE", case=False, na=False)
107
+ ]
108
+ # machine_label = self.aligned_paragraphs_df[
109
+ # self.aligned_paragraphs_df["label"] == "MACHINE"
110
+ # ]
111
+ if len(machine_label) > 0:
112
+ label = " ".join(machine_label["label"].tolist())
113
+ self.text_prediction_label[0] = label
114
+ self.text_prediction_score[0] = machine_label["score"].mean()
115
+ else:
116
+ machine_label = self.aligned_paragraphs_df[
117
+ self.aligned_paragraphs_df["label"] == "HUMAN"
118
+ ]
119
+ self.text_prediction_label[0] = "HUMAN"
120
+ self.text_prediction_score[0] = machine_label["score"].mean()
121
+ else: # no source found in the input text
122
+ print("No source found in the input text")
123
+ text = " ".join(self.aligned_paragraphs_df["input"].tolist())
124
  # detect by baseline model
125
+ label, score = detect_text_by_ai_model(text)
126
+ self.text_prediction_label[0] = label
127
+ self.text_prediction_score[0] = score
128
+
129
  def find_text_source(self):
130
  """
131
  Determines the origin of the given text based on paraphrasing detection
 
147
  # Setup df for input_sentences
148
 
149
  for _ in range(len(input_sentences)):
150
+ self.aligned_paragraphs_df = pd.concat(
151
+ [self.aligned_paragraphs_df, pd.DataFrame([{
152
  "input": None,
153
  "source": None,
154
  "label": None,
 
161
  )
162
 
163
  # find a source for each paragraph
164
+ for index, _ in enumerate(input_sentences):
165
+ similarity = self.aligned_paragraphs_df.loc[index, "similarity"]
166
+ if similarity is not None:
167
+ if similarity > PARAPHRASE_THRESHOLD_MACHINE:
168
+ continue
169
 
170
+ print(f"\n-------index = {index}-------")
171
+ print(f"current_text = {input_sentences[index]}\n")
172
 
173
+ self.aligned_paragraphs_df, img_urls = find_paragraph_source(
174
  input_sentences,
175
  index,
176
+ self.aligned_paragraphs_df,
177
  )
178
 
179
  self.found_img_url.extend(img_urls)
180
 
181
  # determine if the whole source is from a news or not
182
 
183
+ def verify_text(self, url):
184
+ label = "UNKNOWN"
185
+ score = 0
186
  # calculate the average similarity when the similary score in each row of sentences_df is higher than 0.8
187
+ filtered_by_url = self.aligned_paragraphs_df[
188
+ self.aligned_paragraphs_df["url"] == url
189
  ]
190
+ filtered_by_similarity = filtered_by_url[
191
+ filtered_by_url["similarity"] > 0.8
192
+ ]
193
+ if len(filtered_by_similarity) / len(self.aligned_paragraphs_df) > 0.5:
194
+ # check if "MACHINE" is in self.aligned_sentences_df["label"]:
195
+ contains_machine = filtered_by_similarity["label"].str.contains(
196
+ "MACHINE", case=False, na=False
197
+ ).any()
198
+ if contains_machine:
199
+ label = "MACHINE"
200
+ machine_rows = filtered_by_similarity[
201
+ filtered_by_similarity["label"].str.contains(
202
+ "MACHINE",
203
+ case=False,
204
+ na=False)
205
+ ]
206
+ generated_model, _ = predict_generation_model(self.news_text)
207
+ label += f"<br>({generated_model})"
208
+ score = machine_rows["similarity"].mean()
209
+ else:
210
+ label = "HUMAN"
211
+ human_rows = filtered_by_similarity[
212
+ filtered_by_similarity["label"].str.contains(
213
+ "HUMAN",
214
+ case=False,
215
+ na=False)
216
+ ]
217
+ score = human_rows["similarity"].mean()
218
+
219
+ return label, score
220
+
221
 
222
  def determine_image_origin(self):
223
  print("CHECK IMAGE:")
 
227
  self.image_referent_url = None
228
  return
229
 
 
 
230
  matched_url, similarity = detect_image_from_news_image(
231
  self.news_image,
232
  self.found_img_url,
233
  )
234
  if matched_url is not None:
235
+ print(f"matched image: {matched_url}\nsimilarity: {similarity}\n")
236
  self.image_prediction_label = "HUMAN"
237
  self.image_prediction_score = similarity
238
  self.image_referent_url = matched_url
 
242
  self.news_image,
243
  )
244
  if matched_url is not None:
245
+ print(f"matched image: {matched_url}\tScore: {similarity}%\n")
246
  self.image_prediction_label = "HUMAN"
247
  self.image_prediction_score = similarity
248
  self.image_referent_url = matched_url
 
260
  self.image_prediction_score = 50
261
  self.image_referent_url = None
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  def generate_analysis_report(self):
264
+ if self.news_text != "":
265
+ self.determine_text_origin()
266
+ if self.news_image != "":
267
+ self.determine_image_origin()
268
 
269
  def analyze_details(self):
270
+ self.handle_entities()
271
+ ordinary_user_table = self.create_ordinary_user_table()
272
+ fact_checker_table = self.create_fact_checker_table()
273
+ governor_table = self.create_governor_table()
274
+
275
+ return ordinary_user_table, fact_checker_table, governor_table
276
+
277
+ def handle_entities(self):
278
  entities_with_colors = []
279
+ for index, row in self.grouped_url_df.iterrows():
280
  # Get entity-words (in pair) with colors
281
  entities_with_colors = highlight_entities(
282
+ row["input"],
283
+ row["source"],
284
  )
 
285
 
286
+ #self.grouped_url_df.at[index, "entities"] = entities_with_colors # must use at
287
+
288
+ for index, paragraph in self.aligned_paragraphs_df.iterrows():
289
+ if paragraph["url"] == row["url"]:
290
+ self.aligned_paragraphs_df.at[index, "entities"] = entities_with_colors # must use at
291
 
 
292
 
293
  def get_text_urls(self):
294
  return set(self.text_referent_url)
 
335
  max_length = 30 # TODO: put this in configuration
336
  rows.append(self.format_image_fact_checker_row(max_length))
337
 
338
+ for _, row in self.aligned_paragraphs_df.iterrows():
339
+ if row["input"] == None:
340
  continue
341
+
342
+ if row["source"] == None:
343
+ equal_idx_1 = equal_idx_2 = []
344
+
345
+ else: # Get index of equal phrases in input and source sentences
346
+ equal_idx_1, equal_idx_2 = extract_equal_text(
347
+ row["input"],
348
+ row["source"],
349
+ )
 
 
 
350
 
351
  self.fact_checker_table.append(
352
  [
353
+ row,
354
  equal_idx_1,
355
  equal_idx_2,
356
+ row["entities"],
357
+ row["url"]
358
  ],
359
  )
360
+
361
+ previous_url = None
362
+ span_row = 1
363
+ for index, row in enumerate(self.fact_checker_table):
364
+ current_url = row[4]
365
+ last_url_row = False
366
+
367
+ # First row or URL change
368
+ if index == 0 or current_url != previous_url:
369
+ first_url_row = True
370
+ previous_url = current_url
371
+ # Increase counter "span_row" when the next url is the same
372
+ while index + span_row < len(self.fact_checker_table) \
373
+ and self.fact_checker_table[index + span_row][4] == current_url:
374
+ span_row += 1
375
+
376
+ else:
377
+ first_url_row = False
378
+ span_row -= 1
379
+
380
+ if span_row == 1:
381
+ last_url_row = True
382
+
383
+ formatted_row = self.format_text_fact_checker_row(row, first_url_row, last_url_row, span_row, max_length)
384
  rows.append(formatted_row)
385
 
386
  table = "\n".join(rows)
 
394
  <thead>
395
  <tr>
396
  <th>Input news</th>
397
+ <th>Source (URL in Originality)</th>
398
  <th>Forensic</th>
399
  <th>Originality</th>
400
  </tr>
 
407
  <style>
408
  """
409
 
410
+ def format_text_fact_checker_row(
411
+ self,
412
+ row,
413
+ first_url_row=True,
414
+ last_url_row=True,
415
+ span_row=1,
416
+ max_length=30,
417
+ ):
418
  entity_count = 0
419
+ if row[0]["input"] is None:
420
  return ""
421
+ if row[0]["source"] is not None: # source is not empty
422
+ if row[3] is not None:
423
+ # highlight entities
424
+ input_sentence, highlight_idx_input = apply_highlight(
425
+ row[0]["input"],
426
+ row[3],
427
+ "input",
428
+ )
429
+ source_sentence, highlight_idx_source = apply_highlight(
430
+ row[0]["source"],
431
+ row[3],
432
+ "source",
433
+ )
434
+ else:
435
+ input_sentence = row[0]["input"]
436
+ source_sentence = row[0]["source"]
437
+ highlight_idx_input = []
438
+ highlight_idx_source = []
439
+
440
+ if row[3] is not None:
441
+ entity_count = len(row[3])
442
 
443
  # Color overlapping words
444
  input_sentence = self.color_text(
 
452
  highlight_idx_source,
453
  ) # text, index of highlight words
454
 
455
+ # Replace _ to get correct formatting
456
+ # Original one having _ for correct word counting
457
  input_sentence = input_sentence.replace(
458
  "span_style",
459
  "span style",
 
466
  input_sentence = row[0]["input"]
467
  source_sentence = row[0]["source"]
468
 
469
+ url = row[0]["url"]
470
+ # Displayed label and score by url
471
+ filterby_url = self.grouped_url_df[
472
+ self.grouped_url_df["url"] == url
473
+ ]
474
+ if len(filterby_url) > 0:
475
+ label = filterby_url["label"].values[0]
476
+ score = filterby_url["score"].values[0]
477
+ else:
478
+ label = self.text_prediction_label[0]
479
+ score = self.text_prediction_score[0]
480
+
481
+ # Format displayed url
482
+
483
  short_url = self.shorten_url(url, max_length)
484
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
485
 
486
+ # Format displayed entity count
487
  entity_count_text = self.get_entity_count_text(entity_count)
488
+
489
+ border_top = "border-top: 1px solid transparent;"
490
+ border_bottom = "border-bottom: 1px solid transparent;"
491
+ if first_url_row is True:
492
+ # First & Last the group: no transparent
493
+ if last_url_row is True:
494
+ return f"""
495
+ <tr>
496
+ <td>{input_sentence}</td>
497
+ <td>{source_sentence}</td>
498
+ <td rowspan="{span_row}">{label}<br>
499
+ ({score * 100:.2f}%)<br><br>
500
+ {entity_count_text}</td>
501
+ <td rowspan="{span_row}">{source_text_url}</td>
502
+ </tr>
503
+ """
504
+ # First row of the group: transparent bottom border
505
+ return f"""
506
+ <tr>
507
+ <td style="{border_bottom}";>{input_sentence}</td>
508
+ <td style="{border_bottom}";>{source_sentence}</td>
509
+ <td rowspan="{span_row}">{label}<br>
510
+ ({score * 100:.2f}%)<br><br>
511
+ {entity_count_text}</td>
512
+ <td rowspan="{span_row}">{source_text_url}</td>
513
+ </tr>
514
+ """
515
+ else:
516
+ if last_url_row is True:
517
+ # NOT First row, Last row: transparent top border
518
+ return f"""
519
+ <tr>
520
+ <td style="{border_top}";>{input_sentence}</td>
521
+ <td style="{border_top}";>{source_sentence}</td>
522
+ </tr>
523
+ """
524
+ else:
525
+ # NOT First & NOT Last row: transparent top & bottom borders
526
+ return f"""
527
+ <tr>
528
+ <td style="{border_top} {border_bottom}";>{input_sentence}</td>
529
+ <td style="{border_top} {border_bottom}";>{source_sentence}</td>
530
+ </tr>
531
+ """
532
 
533
  def format_image_fact_checker_row(self, max_length=30):
534
 
 
536
  self.image_referent_url is not None
537
  or self.image_referent_url != ""
538
  ):
539
+ source_image = f"""<img src="{self.image_referent_url}" width="100" height="150">""" # noqa: E501
540
  short_url = self.shorten_url(self.image_referent_url, max_length)
541
  source_image_url = (
542
  f"""<a href="{self.image_referent_url}">{short_url}</a>"""
 
558
  <h5>Comparison between input news and source news:</h5>
559
  <table border="1" style="width:100%; text-align:left;">
560
  <col style="width: 170px;">
 
561
  <col style="width: 30px;">
562
  <col style="width: 75px;">
563
  <thead>
 
578
  def format_text_ordinary_user_row(self, max_length=30):
579
  input_sentences = ""
580
  source_text_urls = ""
581
+ urls = []
582
+ for _, row in self.aligned_paragraphs_df.iterrows():
583
+ if row["input"] == None:
 
 
584
  continue
585
  input_sentences += row["input"] + "<br><br>"
586
+ url = row["url"]
587
+ if url not in urls:
588
+ urls.append(url)
589
+ short_url = self.shorten_url(url, max_length)
590
+ source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
 
 
 
591
 
592
  return f"""
593
  <tr>
594
  <td>{input_sentences}</td>
595
+ <td>{self.text_prediction_label[0]}<br>
596
+ ({self.text_prediction_score[0] * 100:.2f}%)</td>
597
  <td>{source_text_urls}</td>
598
  </tr>
599
  """
 
619
  max_length = 30 # TODO: put this in configuration
620
  rows.append(self.format_image_governor_row(max_length))
621
 
622
+ for _, row in self.aligned_paragraphs_df.iterrows():
623
+ if row["input"] == None:
624
  continue
625
+
626
+ if row["source"] == None:
627
+ equal_idx_1 = equal_idx_2 = []
628
+
629
+ else:
630
+ # Get index of equal phrases in input and source sentences
631
+ equal_idx_1, equal_idx_2 = extract_equal_text(
632
+ row["input"],
633
+ row["source"],
634
+ )
 
 
635
 
636
  self.governor_table.append(
637
  [
638
+ row,
639
  equal_idx_1,
640
  equal_idx_2,
641
+ row["entities"],
642
  ],
643
  )
644
 
 
656
  <thead>
657
  <tr>
658
  <th>Input news</th>
659
+ <th>Source (URL in Originality)</th>
660
  <th>Forensic</th>
661
  <th>Originality</th>
662
  </tr>
 
673
  input_sentences = ""
674
  source_sentences = ""
675
  source_text_urls = ""
676
+ urls = []
677
  sentence_count = 0
678
+ entity_count = [0, 0] # to get index of [-2]
679
  for row in self.governor_table:
680
+ if row[0]["input"] is None:
 
681
  continue
682
 
683
+ if row[0]["source"] is not None and row[3] is not None: # source is not empty
684
  # highlight entities
685
  input_sentence, highlight_idx_input = apply_highlight(
686
  row[0]["input"],
687
+ row[3], # entities_with_colors
688
+ "input", # key
689
+ entity_count[-2], # since the last one is for current counting
690
  )
691
  source_sentence, highlight_idx_source = apply_highlight(
692
  row[0]["source"],
693
+ row[3], # entities_with_colors
694
+ "source", # key
695
+ entity_count[-2], # since the last one is for current counting
696
  )
 
697
 
698
  # Color overlapping words
699
  input_sentence = self.color_text(
 
717
  ).replace("1px_4px", "1px 4px")
718
 
719
  else:
720
+ if row[0]["source"] is None:
721
+ source_sentence = ""
722
+ else:
723
+ source_sentence = row[0]["source"]
724
  input_sentence = row[0]["input"]
725
+
726
 
727
  # convert score to HUMAN-based score:
728
  input_sentences += input_sentence + "<br><br>"
729
  source_sentences += source_sentence + "<br><br>"
730
+
731
  url = row[0]["url"]
732
+ if url not in urls:
733
+ urls.append(url)
734
+ short_url = self.shorten_url(url, max_length)
735
+ source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
736
+ sentence_count += 1
737
+ if row[3] is not None:
738
+ entity_count.append(len(row[3]))
739
+
740
+ entity_count_text = self.get_entity_count_text(sum(entity_count))
741
 
742
  return f"""
743
  <tr>
744
  <td>{input_sentences}</td>
745
  <td>{source_sentences}</td>
746
+ <td>{self.text_prediction_label[0]}<br>
747
+ ({self.text_prediction_score[0] * 100:.2f}%)<br><br>
748
+ {entity_count_text}</td>
749
  <td>{source_text_urls}</td>
750
  </tr>
751
  """
 
755
  self.image_referent_url is not None
756
  or self.image_referent_url != ""
757
  ):
758
+ source_image = f"""<img src="{self.image_referent_url}" width="100" height="150">""" # noqa: E501
759
  short_url = self.shorten_url(self.image_referent_url, max_length)
760
  source_image_url = (
761
  f"""<a href="{self.image_referent_url}">{short_url}</a>"""
 
770
  if entity_count <= 0:
771
  entity_count_text = ""
772
  elif entity_count == 1:
773
+ entity_count_text = "with 1 altered entity"
774
  else:
775
  entity_count_text = "with altered entities"
776
  return entity_count_text
 
791
 
792
  starts, ends = self.extract_starts_ends(colored_idx)
793
  starts, ends = self.filter_indices(starts, ends, highlighted_idx)
794
+
795
  previous_end = 0
796
  for start, end in zip(starts, ends):
797
  paragraph += " ".join(words[previous_end:start])
 
801
 
802
  previous_end = end
803
 
804
+ paragraph += " ".join(words[previous_end:])
 
 
 
 
 
 
805
 
806
  return paragraph
807
 
 
884
  end = number
885
  else:
886
  starts.append(start)
887
+ ends.append(end)
888
  start = number
889
  end = number
890
 
891
  if i == len(numbers) - 1:
892
  starts.append(start)
893
+ ends.append(end)
 
 
894
 
895
+ return starts, ends
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/application/text/entity.py CHANGED
@@ -161,16 +161,17 @@ def assign_colors_to_entities(entities):
161
 
162
 
163
  def highlight_entities(text1, text2):
164
- if text1 == "" or text2 == "":
165
- return []
166
 
167
  entities_text = extract_entities_gpt(text1, text2)
168
- print(f"entities_text: {entities_text}")
169
 
170
  # Clean up entities: remove wrapping characters
171
  entities_text = entities_text.replace("```json", "").replace("```", "")
172
 
173
  entities = read_json(entities_text)
 
 
174
 
175
  # Assign colors to entities
176
  entities_with_colors = assign_colors_to_entities(entities)
@@ -179,7 +180,7 @@ def highlight_entities(text1, text2):
179
 
180
 
181
  def apply_highlight(text, entities_with_colors, key="input", count=0):
182
- if entities_with_colors == []:
183
  return text, []
184
 
185
  all_starts = []
 
161
 
162
 
163
  def highlight_entities(text1, text2):
164
+ if text1 == None or text2 == None:
165
+ return None
166
 
167
  entities_text = extract_entities_gpt(text1, text2)
 
168
 
169
  # Clean up entities: remove wrapping characters
170
  entities_text = entities_text.replace("```json", "").replace("```", "")
171
 
172
  entities = read_json(entities_text)
173
+ if len(entities) == 0:
174
+ return None
175
 
176
  # Assign colors to entities
177
  entities_with_colors = assign_colors_to_entities(entities)
 
180
 
181
 
182
  def apply_highlight(text, entities_with_colors, key="input", count=0):
183
+ if entities_with_colors is None:
184
  return text, []
185
 
186
  all_starts = []
src/application/text/helper.py CHANGED
@@ -147,7 +147,7 @@ def extract_equal_text(text1, text2):
147
  text = text.lower()
148
  text = text.translate(str.maketrans("", "", string.punctuation))
149
  return text
150
-
151
  splited_text1 = cleanup(text1).split()
152
  splited_text2 = cleanup(text2).split()
153
 
@@ -163,8 +163,7 @@ def extract_equal_text(text1, text2):
163
  equal_idx_2.append({"start": j1, "end": j2})
164
  # subtext_1 = " ".join(text1[i1:i2])
165
  # subtext_2 = " ".join(text2[j1:j2])
166
- # print(f'{tag:7} a[{i1:2}:{i2:2}]
167
- # --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
168
  return equal_idx_1, equal_idx_2
169
 
170
 
 
147
  text = text.lower()
148
  text = text.translate(str.maketrans("", "", string.punctuation))
149
  return text
150
+
151
  splited_text1 = cleanup(text1).split()
152
  splited_text2 = cleanup(text2).split()
153
 
 
163
  equal_idx_2.append({"start": j1, "end": j2})
164
  # subtext_1 = " ".join(text1[i1:i2])
165
  # subtext_2 = " ".join(text2[j1:j2])
166
+ # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
 
167
  return equal_idx_1, equal_idx_2
168
 
169
 
src/application/text/model_detection.py CHANGED
@@ -1,24 +1,48 @@
1
  from transformers import pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  # TODO: move to a config file
4
- DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
 
5
 
6
- MODEL_HUMAN_LABEL = {DEFAULT_MODEL: "Human"}
7
  HUMAN = "HUMAN"
8
  MACHINE = "MACHINE"
9
  UNKNOWN = "UNKNOWN"
10
  PARAPHRASE = "PARAPHRASE"
11
  NON_PARAPHRASE = "NON_PARAPHRASE"
12
 
 
 
 
 
 
13
 
14
  def detect_text_by_ai_model(
15
  input_text: str,
16
- model: str = DEFAULT_MODEL,
17
  max_length: int = 512,
18
  ) -> tuple:
19
  """
20
- Model: chatgpt_detector_roberta
21
- Ref: https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta
22
 
23
  Detects if text is human or machine generated.
24
 
@@ -42,7 +66,89 @@ def detect_text_by_ai_model(
42
  label = HUMAN
43
  else:
44
  label = MACHINE
 
 
45
  return label, confidence_score
46
  except Exception as e: # Add exception handling
47
  print(f"Error in Roberta model inference: {e}")
48
  return UNKNOWN, 0.5 # Return UNKNOWN and 0.0 confidence if error
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from transformers import pipeline
2
+ import os
3
+
4
+ from dotenv import load_dotenv
5
+ from openai import AzureOpenAI, OpenAIError
6
+ from sentence_transformers import SentenceTransformer, util
7
+ import torch
8
+
9
+
10
+ load_dotenv()
11
+ AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
12
+ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
13
+ AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
14
+
15
+ azure_client = AzureOpenAI(
16
+ azure_endpoint="https://quoc-nguyen.openai.azure.com/",
17
+ api_key=AZURE_OPENAI_API_KEY,
18
+ api_version="2024-05-01-preview",
19
+ )
20
 
21
  # TODO: move to a config file
22
+ # AI_TEXT_DECTECTION_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
23
+ AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
24
 
25
+ MODEL_HUMAN_LABEL = {AI_TEXT_DECTECTION_MODEL: "Human"}
26
  HUMAN = "HUMAN"
27
  MACHINE = "MACHINE"
28
  UNKNOWN = "UNKNOWN"
29
  PARAPHRASE = "PARAPHRASE"
30
  NON_PARAPHRASE = "NON_PARAPHRASE"
31
 
32
+ # load the embedding model
33
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34
+ PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
35
+ PARAPHASE_MODEL.to(DEVICE)
36
+
37
 
38
  def detect_text_by_ai_model(
39
  input_text: str,
40
+ model: str = AI_TEXT_DECTECTION_MODEL,
41
  max_length: int = 512,
42
  ) -> tuple:
43
  """
44
+ Model: RADAR-Vicuna-7B
45
+ Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B
46
 
47
  Detects if text is human or machine generated.
48
 
 
66
  label = HUMAN
67
  else:
68
  label = MACHINE
69
+ generated_model, _ = predict_generation_model(input_text)
70
+ label += f"<br>({generated_model})"
71
  return label, confidence_score
72
  except Exception as e: # Add exception handling
73
  print(f"Error in Roberta model inference: {e}")
74
  return UNKNOWN, 0.5 # Return UNKNOWN and 0.0 confidence if error
75
+
76
+
77
+ def predict_generation_model(text:str) -> tuple[str, float]:
78
+ """
79
+ Predicts if text is generated by gpt-4o or gpt-4o-mini models.
80
+ Compare the input text against the paraphrased text by the models.
81
+
82
+ Returns:
83
+ tuple: (label, confidence_score)
84
+ where label is gpt-4o or gpt-4o-mini.
85
+ """
86
+ best_similarity = 0
87
+ best_model = "gpt-4o"
88
+ models = ["gpt-4o", "gpt-4o-mini"]
89
+ for model in models:
90
+ paraphrased_text = paraphrase_by_AI(text, model)
91
+ if paraphrased_text is None:
92
+ continue
93
+ similarity = measure_text_similarity(text, paraphrased_text)
94
+ if similarity > best_similarity:
95
+ best_similarity = similarity
96
+ best_model = model
97
+
98
+ return best_model, best_similarity
99
+
100
+
101
+ def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str:
102
+ """
103
+ Paraphrase text using a given model.
104
+
105
+ Returns:
106
+ str: Paraphrased text.
107
+ """
108
+
109
+ prompt = f"""
110
+ Paraphrase the following news, only output the paraphrased text:
111
+ {input_text}
112
+ """
113
+ try:
114
+ response = azure_client.chat.completions.create(
115
+ model=model,
116
+ messages=[
117
+ {"role": "user", "content": prompt},
118
+ ],
119
+ # max_tokens=100,
120
+ # temperature=0.7,
121
+ # top_p=0.9,
122
+ # n=1,
123
+ )
124
+ paraphrased_text = response.choices[0].message.content
125
+ return paraphrased_text
126
+ except OpenAIError as e: # Add exception handling
127
+ print(f"Error in AI model inference: {e}")
128
+ return None
129
+
130
+ def measure_text_similarity(text1: str, text2: str) -> float:
131
+ """
132
+ Measure the similarity between two texts.
133
+
134
+ Returns:
135
+ float: Similarity score.
136
+ """
137
+ embeddings1 = PARAPHASE_MODEL.encode(
138
+ text1,
139
+ convert_to_tensor=True,
140
+ device=DEVICE,
141
+ show_progress_bar=False,
142
+ )
143
+ embeddings2 = PARAPHASE_MODEL.encode(
144
+ text2,
145
+ convert_to_tensor=True,
146
+ device=DEVICE,
147
+ show_progress_bar=False,
148
+ )
149
+
150
+ # Compute cosine similarity matrix
151
+ similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
152
+ print(similarity[0][0])
153
+ return similarity[0][0]
154
+
src/application/text/search.py CHANGED
@@ -174,7 +174,6 @@ def generate_search_phrases(input_text):
174
  # Method 4: Get most identities and key words
175
  entities = extract_entities(input_text)
176
  text_without_entities = remove_identities_from_text(input_text, entities)
177
- print(f"text_without_entities: {text_without_entities}")
178
  search_phrases.append(text_without_entities)
179
  # keywords = get_keywords(input_text, 16)
180
  # search_phrase = " ".join(entities) + " " + " ".join(keywords)
 
174
  # Method 4: Get most identities and key words
175
  entities = extract_entities(input_text)
176
  text_without_entities = remove_identities_from_text(input_text, entities)
 
177
  search_phrases.append(text_without_entities)
178
  # keywords = get_keywords(input_text, 16)
179
  # search_phrase = " ".join(entities) + " " + " ".join(keywords)
src/application/text/search_detection.py CHANGED
@@ -1,17 +1,14 @@
1
- import string
2
  import warnings
3
  from difflib import SequenceMatcher
4
 
5
  import nltk
6
  import numpy as np
7
- import pandas as pd
8
  import torch
9
  from sentence_transformers import (
10
  SentenceTransformer,
11
  util,
12
  )
13
 
14
- from src.application.text.helper import extract_equal_text
15
  from src.application.text.preprocessing import split_into_paragraphs
16
  from src.application.text.search import (
17
  generate_search_phrases,
@@ -41,102 +38,11 @@ MIN_RATIO_PARAPHRASE_NUM = 0.5
41
  MAX_CHAR_SIZE = 30000
42
 
43
 
44
- def detect_text_by_relative_search(
45
- input_text,
46
- index,
47
- is_support_opposite=False,
48
- ):
49
- checked_urls = set()
50
- searched_phrases = generate_search_phrases(input_text[index])
51
-
52
- for candidate in searched_phrases:
53
- search_results = search_by_google(candidate)
54
- urls = [item["link"] for item in search_results.get("items", [])]
55
-
56
- for url in urls[:3]:
57
- if url in checked_urls: # visited url
58
- continue
59
- if "bbc.com" not in url:
60
- continue
61
-
62
- checked_urls.add(url)
63
- print(f"\t\tChecking URL: {url}")
64
-
65
- content = URLReader(url)
66
-
67
- if content.is_extracted is True:
68
- if content.title is None or content.text is None:
69
- print("\t\t\t↑↑↑ Title or text not found")
70
- continue
71
-
72
- page_text = content.title + "\n" + content.text
73
- if len(page_text) > MAX_CHAR_SIZE:
74
- print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
75
- continue
76
- print(f"\t\t\t↑↑↑ Title: {content.title}")
77
- aligned_first_sentences = check_paraphrase(
78
- input_text[index],
79
- page_text,
80
- url,
81
- )
82
- is_paraphrased = aligned_first_sentences["is_paraphrased"]
83
-
84
- if is_paraphrased is False:
85
- return (
86
- is_paraphrased,
87
- url,
88
- aligned_first_sentences,
89
- content.images,
90
- index,
91
- )
92
-
93
- sub_paraphrase = True
94
- while sub_paraphrase is True:
95
- index += 1
96
- print(f"----search {index} < {len(input_text)}----")
97
- if index >= len(input_text):
98
- print(f"input_text_last: {input_text[-1]}")
99
- break
100
- print(f"input_text: {input_text[index]}")
101
- sub_sentences = check_paraphrase(
102
- input_text[index],
103
- page_text,
104
- url,
105
- )
106
- sub_paraphrase = sub_sentences["is_paraphrased"]
107
- print(f"sub_paraphrase: {sub_paraphrase}")
108
- print(f"sub_sentences: {sub_sentences}")
109
- if sub_paraphrase is True:
110
- aligned_first_sentences["input"] += (
111
- "<br>" + sub_sentences["input"]
112
- )
113
- aligned_first_sentences["source"] += (
114
- "<br>" + sub_sentences["source"]
115
- )
116
- aligned_first_sentences["similarity"] += sub_sentences[
117
- "similarity"
118
- ]
119
- aligned_first_sentences["similarity"] /= 2
120
-
121
- print(f"paraphrase: {is_paraphrased}")
122
- print(f"aligned_first_sentences: {aligned_first_sentences}")
123
- return (
124
- is_paraphrased,
125
- url,
126
- aligned_first_sentences,
127
- content.images,
128
- index,
129
- )
130
-
131
- return False, None, [], [], index
132
-
133
-
134
  def find_paragraph_source(text, text_index, sentences_df):
135
 
136
  checked_urls = set()
137
  searched_phrases = generate_search_phrases(text[text_index])
138
- print(f"text[text_index]: {text[text_index]}")
139
- print(f"searched_phrases: {searched_phrases}")
140
  for candidate in searched_phrases:
141
  search_results = search_by_google(candidate)
142
  urls = [item["link"] for item in search_results.get("items", [])]
@@ -169,11 +75,10 @@ def find_paragraph_source(text, text_index, sentences_df):
169
  )
170
 
171
  if aligned_sentence["paraphrase"] is False:
172
- print(f'sentence_1: {sentences_df.loc[text_index, "input"]}')
173
- print(f'sentence_2: {aligned_sentence["input"]}')
174
  sentences_df.loc[text_index, "input"] = aligned_sentence["input"]
175
  sentences_df.loc[text_index, "paraphrase"] = aligned_sentence["paraphrase"]
176
  return sentences_df, []
 
177
  # assign values
178
  columns = [
179
  "input",
@@ -187,32 +92,29 @@ def find_paragraph_source(text, text_index, sentences_df):
187
  if c in sentences_df.columns:
188
  sentences_df.loc[text_index, c] = aligned_sentence[c]
189
 
190
-
191
- print(f"sen: {sentences_df}")
192
- for idx, _ in enumerate(sentences_df):
193
- print(f"{idx}")
194
- if idx > len(sentences_df):
195
- break
196
- if sentences_df.loc[idx, "url"] is not None:
197
- continue
198
-
199
- # find content in new url
200
  aligned_sentence = check_paraphrase(
201
  text[idx],
202
  page_text,
203
  url,
204
  )
205
 
206
- if aligned_sentence["url"] is None:
207
- continue
208
-
209
- columns = ["input", "source", "label", "similarity", "url"]
210
- for c in columns:
211
- if c in sentences_df.columns:
212
- sentences_df.loc[text_index, c] = aligned_sentence[c]
213
-
214
  return sentences_df, content.images
215
 
 
216
  return sentences_df, []
217
 
218
 
@@ -344,11 +246,13 @@ def check_paraphrase(input_text, page_text, url):
344
  input_paragraphs,
345
  convert_to_tensor=True,
346
  device=DEVICE,
 
347
  )
348
  embeddings2 = PARAPHASE_MODEL.encode(
349
  page_paragraphs,
350
  convert_to_tensor=True,
351
  device=DEVICE,
 
352
  )
353
 
354
  # Compute cosine similarity matrix
@@ -361,12 +265,7 @@ def check_paraphrase(input_text, page_text, url):
361
  max_similarity = similarity_matrix[i][max_sim_index]
362
 
363
  label, is_paraphrased = determine_label(max_similarity)
364
- print(f"is_paraphrased: {is_paraphrased}")
365
- if is_paraphrased is False:
366
- url = None
367
- best_matched_paragraph = None
368
- else:
369
- best_matched_paragraph = page_paragraphs[max_sim_index]
370
 
371
  alignment = {
372
  "input": paragraph,
@@ -376,6 +275,7 @@ def check_paraphrase(input_text, page_text, url):
376
  "paraphrase": is_paraphrased,
377
  "url": url,
378
  }
 
379
 
380
  return alignment
381
 
@@ -423,7 +323,7 @@ def determine_label(similarity):
423
  elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
424
  return "MACHINE", True
425
  else:
426
- return "", False
427
 
428
 
429
  if __name__ == "__main__":
 
 
1
  import warnings
2
  from difflib import SequenceMatcher
3
 
4
  import nltk
5
  import numpy as np
 
6
  import torch
7
  from sentence_transformers import (
8
  SentenceTransformer,
9
  util,
10
  )
11
 
 
12
  from src.application.text.preprocessing import split_into_paragraphs
13
  from src.application.text.search import (
14
  generate_search_phrases,
 
38
  MAX_CHAR_SIZE = 30000
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def find_paragraph_source(text, text_index, sentences_df):
42
 
43
  checked_urls = set()
44
  searched_phrases = generate_search_phrases(text[text_index])
45
+
 
46
  for candidate in searched_phrases:
47
  search_results = search_by_google(candidate)
48
  urls = [item["link"] for item in search_results.get("items", [])]
 
75
  )
76
 
77
  if aligned_sentence["paraphrase"] is False:
 
 
78
  sentences_df.loc[text_index, "input"] = aligned_sentence["input"]
79
  sentences_df.loc[text_index, "paraphrase"] = aligned_sentence["paraphrase"]
80
  return sentences_df, []
81
+
82
  # assign values
83
  columns = [
84
  "input",
 
92
  if c in sentences_df.columns:
93
  sentences_df.loc[text_index, c] = aligned_sentence[c]
94
 
95
+
96
+ for idx, _ in sentences_df.iterrows():
97
+ similarity = sentences_df.loc[idx, "similarity"]
98
+ if similarity is not None:
99
+ if similarity > PARAPHRASE_THRESHOLD_MACHINE:
100
+ continue
101
+
102
+ # find matched content in new url
 
 
103
  aligned_sentence = check_paraphrase(
104
  text[idx],
105
  page_text,
106
  url,
107
  )
108
 
109
+ if similarity is None or \
110
+ aligned_sentence["similarity"] > similarity:
111
+ columns = ["input", "source", "label", "similarity", "url"]
112
+ for c in columns:
113
+ if c in sentences_df.columns:
114
+ sentences_df.loc[idx, c] = aligned_sentence[c]
 
 
115
  return sentences_df, content.images
116
 
117
+ sentences_df.loc[text_index, "input"] = text[text_index]
118
  return sentences_df, []
119
 
120
 
 
246
  input_paragraphs,
247
  convert_to_tensor=True,
248
  device=DEVICE,
249
+ show_progress_bar=False,
250
  )
251
  embeddings2 = PARAPHASE_MODEL.encode(
252
  page_paragraphs,
253
  convert_to_tensor=True,
254
  device=DEVICE,
255
+ show_progress_bar=False,
256
  )
257
 
258
  # Compute cosine similarity matrix
 
265
  max_similarity = similarity_matrix[i][max_sim_index]
266
 
267
  label, is_paraphrased = determine_label(max_similarity)
268
+ best_matched_paragraph = page_paragraphs[max_sim_index]
 
 
 
 
 
269
 
270
  alignment = {
271
  "input": paragraph,
 
275
  "paraphrase": is_paraphrased,
276
  "url": url,
277
  }
278
+ print(f"Result: [{alignment['similarity']}] {alignment['source']}")
279
 
280
  return alignment
281
 
 
323
  elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
324
  return "MACHINE", True
325
  else:
326
+ return None, False
327
 
328
 
329
  if __name__ == "__main__":
test.py CHANGED
@@ -1,13 +1,2 @@
1
- import numpy as np
2
- import pandas as pd
3
-
4
- # Create an empty DataFrame with 5 columns
5
- df = pd.DataFrame(columns=['col1', 'col2', 'col3', 'col4', 'col5']) # Or any column names you want
6
-
7
- # Method 1: Using a dictionary and append (less efficient for large DataFrames)
8
- for _ in range(5): # Add 5 rows
9
- df = pd.concat([df, pd.DataFrame([{'col1': np.nan, 'col2': np.nan, 'col3': np.nan, 'col4': np.nan, 'col5': np.nan}])], ignore_index=True)
10
- d = {"col1": "ta", "col2": "gs"}
11
- df.loc[1, "col1"] = d["col1"]
12
- for index, row in enumerate(df):
13
- print(index)
 
1
+ my_list = [0, 0]
2
+ print(my_list[-2])