Spaces:

pmkhanh7890
/

news_verification

Running

App Files Files

pmkhanh7890 committed on Feb 28

Commit

e58707f

1 Parent(s): b73a4fc

refactor

Browse files

Files changed (11) hide show

src/application/config.py +1 -1
src/application/content_detection.py +105 -526
src/application/content_generation.py +2 -1
src/application/formatting.py +17 -9
src/application/formatting_fact_checker.py +238 -0
src/application/formatting_governor.py +165 -0
src/application/formatting_ordinary_user.py +33 -29
src/application/image/image.py +5 -0
src/application/text/helper.py +23 -13
src/application/text/search_detection.py +2 -1
src/application/text/text.py +14 -0

src/application/config.py CHANGED Viewed

@@ -88,4 +88,4 @@ ENTITY_BRIGHTNESS = 0.75  # color's brightness.
 # HTML formatting
-WORD_BREAK = "word-break: break-all;"


88
89
90	# HTML formatting
91	+ WORD_BREAK = "word-break: break-all;"

src/application/content_detection.py CHANGED Viewed

@@ -5,19 +5,22 @@ Date: 2024-12-04
 import pandas as pd
-from src.application.config import MIN_RATIO_PARAPHRASE_NUM, PARAPHRASE_THRESHOLD, PARAPHRASE_THRESHOLD_MACHINE
-from src.application.formatting import color_text, format_entity_count
 from src.application.image.image_detection import (
     detect_image_by_ai_model,
     detect_image_by_reverse_search,
     detect_image_from_news_image,
 )
-from src.application.text.entity import (
-    apply_highlight,
-    highlight_entities,
-)
 from src.application.text.helper import (
-    extract_equal_text,
     postprocess_label,
     split_into_paragraphs,
 )
@@ -26,6 +29,7 @@ from src.application.text.model_detection import (
     predict_generation_model,
 )
 from src.application.text.search_detection import find_sentence_source
 class NewsVerification:
@@ -38,12 +42,8 @@ class NewsVerification:
         self.news_content: str = ""
         self.news_image: str = ""
-        self.text_prediction_label: list[str] = ["UNKNOWN"]
-        self.text_prediction_score: list[float] = [0.0]
-        self.image_prediction_label: list[str] = ["UNKNOWN"]
-        self.image_prediction_score: list[str] = [0.0]
-        self.image_referent_url: list[str] = []
         self.news_prediction_label: str = ""
         self.news_prediction_score: float = -1
@@ -63,12 +63,6 @@ class NewsVerification:
                 # "entities",
             ],
         )
-        self.grouped_url_df: pd.DataFrame = pd.DataFrame()
-        # For formatting output tables
-        self.ordinary_user_table: list = []
-        self.fact_checker_table: list = []
-        self.governor_table: list = []
     def load_news(self, news_title: str, news_content: str, news_image: str):
         """
@@ -111,7 +105,7 @@ class NewsVerification:
             )  # Handle mixed data types and NaNs
         # Group sentences by URL and concatenate 'input' and 'source' text.
-        self.grouped_url_df = (
             self.aligned_sentences_df.groupby("url")
             .agg(
                 {
@@ -123,8 +117,8 @@ class NewsVerification:
         )  # Reset index to make 'url' a regular column
         # Add new columns for label and score
-        self.grouped_url_df["label"] = None
-        self.grouped_url_df["score"] = None
         print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")
@@ -132,7 +126,7 @@ class NewsVerification:
         """
         Determines the text origin for each URL group.
         """
-        for index, row in self.grouped_url_df.iterrows():
             # Verify text origin using URL-based verification.
             label, score = self.verify_text(row["url"])
@@ -144,8 +138,8 @@ class NewsVerification:
                 # Detect text origin using an AI model.
                 label, score = detect_text_by_ai_model(text)
-            self.grouped_url_df.at[index, "label"] = label
-            self.grouped_url_df.at[index, "score"] = score
     def determine_text_origin(self):
         """
@@ -166,10 +160,10 @@ class NewsVerification:
         self.determine_text_origin_by_url()
         # Determine the overall label and score for the entire input text.
-        if not self.grouped_url_df.empty:
             # Check for 'gpt-4o' labels in the grouped URLs.
-            machine_label = self.grouped_url_df[
-                self.grouped_url_df["label"].str.contains(
                     "gpt-4o",
                     case=False,
                     na=False,
@@ -183,15 +177,15 @@ class NewsVerification:
                 # labels = " and ".join(machine_label["label"].tolist())
                 # label = remove_duplicate_words(label)
-                self.text_prediction_label[0] = label
-                self.text_prediction_score[0] = machine_label["score"].mean()
             else:
                 # If no 'gpt-4o' labels, assign for 'HUMAN' labels.
                 machine_label = self.aligned_sentences_df[
                     self.aligned_sentences_df["label"] == "HUMAN"
                 ]
-                self.text_prediction_label[0] = "HUMAN"
-                self.text_prediction_score[0] = machine_label["score"].mean()
         else:
             # If no found URLs, use AI detection on the entire input text.
             print("No source found in the input text")
@@ -199,34 +193,40 @@ class NewsVerification:
             # Detect text origin using an AI model.
             label, score = detect_text_by_ai_model(text)
-            self.text_prediction_label[0] = label
-            self.text_prediction_score[0] = score
     def find_text_source(self):
         """
         Determines the origin of the given text based on paraphrasing
             detection and human authorship analysis.
-        1. Splits the input news text into sentences,
         2. Searches for sources for each sentence
         3. Updates the aligned_sentences_df with the found sources.
         """
         print("CHECK TEXT:")
         print("\tFrom search engine:")
         input_paragraphs = split_into_paragraphs(self.news_text)
-        # Initialize an empty DataFrame if it doesn't exist, otherwise extend it.
-        if not hasattr(self, 'aligned_sentences_df') or self.aligned_sentences_df is None:
-            self.aligned_sentences_df = pd.DataFrame(columns=[
-                "input",
-                "source",
-                "label",
-                "similarity",
-                "paraphrase",
-                "url",
-                "entities",
-                ])
         # Setup DataFrame for input_sentences
         for _ in range(len(input_paragraphs)):
@@ -265,19 +265,19 @@ class NewsVerification:
                 index,
                 self.aligned_sentences_df,
             )
             # Initialize found_img_url if it does not exist.
-            if not hasattr(self, 'found_img_url'):
                 self.found_img_url = []
             self.found_img_url.extend(img_urls)
     def verify_text(self, url):
         """
-        Verifies the text origin based on similarity scores and labels
             associated with a given URL.
-        1. Filters sentences by URL and similarity score,
-        2. Determines if the text is likely generated by a machine or a human.
         3. Calculates an average similarity score.
         Args:
@@ -285,27 +285,30 @@ class NewsVerification:
         Returns:
             tuple: A
-                - Label ("MACHINE", "HUMAN", or "UNKNOWN")
                 - Score
         """
         label = "UNKNOWN"
         score = 0
         # calculate the average similarity when the similarity score
         # in each row of sentences_df is higher than 0.8
         # Filter sentences by URL.
         filtered_by_url = self.aligned_sentences_df[
             self.aligned_sentences_df["url"] == url
         ]
         # Filter sentences by similarity score (> PARAPHRASE_THRESHOLD).
         filtered_by_similarity = filtered_by_url[
             filtered_by_url["similarity"] > PARAPHRASE_THRESHOLD
         ]
         # Check if a ratio of remaining filtering-sentences is more than 50%.
-        if len(filtered_by_similarity) / len(self.aligned_sentences_df) > MIN_RATIO_PARAPHRASE_NUM:
             # check if "MACHINE" is in self.aligned_sentences_df["label"]:
             contains_machine = (
                 filtered_by_similarity["label"]
@@ -316,7 +319,7 @@ class NewsVerification:
                 )
                 .any()
             )
             # TODO: integrate with determine_text_origin
             if contains_machine:
                 # If "MACHINE" label is present, set label and calculate score.
@@ -331,7 +334,8 @@ class NewsVerification:
                 label = f"Partially generated by {generated_model}"
                 score = machine_rows["similarity"].mean()
             else:
-                # If no "MACHINE" label, assign "HUMAN" label and calculate score.
                 label = "HUMAN"
                 human_rows = filtered_by_similarity[
                     filtered_by_similarity["label"].str.contains(
@@ -346,21 +350,21 @@ class NewsVerification:
     def determine_image_origin(self):
         """
-        Determines the origin of the news image using various detection methods.
         1.  Matching against previously found image URLs.
         2.  Reverse image search.
         3.  AI-based image detection.
-        If none of these methods succeed, the image origin is marked as "UNKNOWN".
         """
         print("CHECK IMAGE:")
         # Handle the case where no image is provided.
         if self.news_image is None:
-            self.image_prediction_label = "UNKNOWN"
-            self.image_prediction_score = 0.0
-            self.image_referent_url = None
             return
         # Attempt to match the image against previously found image URLs.
@@ -371,9 +375,9 @@ class NewsVerification:
         )
         if matched_url is not None:
             print(f"matched image: {matched_url}\nsimilarity: {similarity}\n")
-            self.image_prediction_label = "HUMAN"
-            self.image_prediction_score = similarity
-            self.image_referent_url = matched_url
             return
         # Attempt to find the image origin using reverse image search.
@@ -383,9 +387,9 @@ class NewsVerification:
         )
         if matched_url is not None:
             print(f"matched image: {matched_url}\tScore: {similarity}%\n")
-            self.image_prediction_label = "HUMAN"
-            self.image_prediction_score = similarity
-            self.image_referent_url = matched_url
             return
         # Attempt to detect the image origin using an AI model.
@@ -393,15 +397,15 @@ class NewsVerification:
         detected_label, score = detect_image_by_ai_model(self.news_image)
         if detected_label:
             print(f"detected_label: {detected_label} ({score})")
-            self.image_prediction_label = detected_label
-            self.image_prediction_score = score
-            self.image_referent_url = None
             return
         # If all detection methods fail, mark the image origin as "UNKNOWN".
-        self.image_prediction_label = "UNKNOWN"
-        self.image_prediction_score = 50
-        self.image_referent_url = None
     def determine_origin(self):
         """
@@ -411,13 +415,13 @@ class NewsVerification:
             self.determine_text_origin()
         if self.news_image != "":
             self.determine_image_origin()
         # Handle entity recognition and processing.
         self.handle_entities()
     def generate_report(self) -> tuple[str, str, str]:
         """
-        Generates reports tailored for different user roles
             (ordinary users, fact checkers, governors).
         Returns:
@@ -426,9 +430,21 @@ class NewsVerification:
                 - fact_checker_table: Report for fact checkers.
                 - governor_table: Report for governors.
         """
-        ordinary_user_table = self.create_ordinary_user_table()
-        fact_checker_table = self.create_fact_checker_table()
-        governor_table = self.create_governor_table()
         return ordinary_user_table, fact_checker_table, governor_table
@@ -436,22 +452,22 @@ class NewsVerification:
         """
         Highlights and assigns entities with colors to aligned sentences
             based on grouped URLs.
         For each grouped URL:
         1. Highlights entities in the input and source text
-        2. Then assigns these highlighted entities to the corresponding
             sentences in the aligned sentences DataFrame.
         """
         entities_with_colors = []
-        for index, row in self.grouped_url_df.iterrows():
             # Get entity-words (in pair) with colors
             entities_with_colors = highlight_entities(
                 row["input"],
                 row["source"],
             )
-            # Assign the highlighted entities to the corresponding sentences
             # in aligned_sentences_df.
             for index, sentence in self.aligned_sentences_df.iterrows():
                 if sentence["url"] == row["url"]:
@@ -468,440 +484,3 @@ class NewsVerification:
             set: A set containing the unique URLs referenced in the text.
         """
         return set(self.text_referent_url)
-    def create_fact_checker_table(self):
-        rows = []
-        rows.append(self.format_image_fact_checker_row())
-        for _, row in self.aligned_sentences_df.iterrows():
-            if row["input"] is None:
-                continue
-            if row["source"] is None:
-                equal_idx_1 = equal_idx_2 = []
-            else:  # Get index of equal phrases in input and source sentences
-                equal_idx_1, equal_idx_2 = extract_equal_text(
-                    row["input"],
-                    row["source"],
-                )
-            self.fact_checker_table.append(
-                [
-                    row,
-                    equal_idx_1,
-                    equal_idx_2,
-                    row["entities"],
-                    row["url"],
-                ],
-            )
-        previous_url = None
-        span_row = 1
-        for index, row in enumerate(self.fact_checker_table):
-            current_url = row[4]
-            last_url_row = False
-            # First row or URL change
-            if index == 0 or current_url != previous_url:
-                first_url_row = True
-                previous_url = current_url
-                # Increase counter "span_row" when the next url is the same
-                while (
-                    index + span_row < len(self.fact_checker_table)
-                    and self.fact_checker_table[index + span_row][4]
-                    == current_url
-                ):
-                    span_row += 1
-            else:
-                first_url_row = False
-                span_row -= 1
-            if span_row == 1:
-                last_url_row = True
-            formatted_row = self.format_text_fact_checker_row(
-                row,
-                first_url_row,
-                last_url_row,
-                span_row,
-            )
-            rows.append(formatted_row)
-        table = "\n".join(rows)
-        return f"""
-<h5>Comparison between input news and source news:</h5>
-<table border="1" style="width:100%; text-align:left;">
-<col style="width: 170px;">
-<col style="width: 170px;">
-<col style="width: 30px;">
-<col style="width: 75px;">
-    <thead>
-        <tr>
-            <th>Input news</th>
-            <th>Source (URL in Originality)</th>
-            <th>Forensic</th>
-            <th>Originality</th>
-        </tr>
-    </thead>
-    <tbody>
-        {table}
-    </tbody>
-</table>
-<style>
-"""
-    def format_text_fact_checker_row(
-        self,
-        row,
-        first_url_row=True,
-        last_url_row=True,
-        span_row=1,
-    ):
-        entity_count = 0
-        if row[0]["input"] is None:
-            return ""
-        if row[0]["source"] is not None:  # source is not empty
-            if row[3] is not None:
-                # highlight entities
-                input_sentence, highlight_idx_input = apply_highlight(
-                    row[0]["input"],
-                    row[3],
-                    "input",
-                )
-                source_sentence, highlight_idx_source = apply_highlight(
-                    row[0]["source"],
-                    row[3],
-                    "source",
-                )
-            else:
-                input_sentence = row[0]["input"]
-                source_sentence = row[0]["source"]
-                highlight_idx_input = []
-                highlight_idx_source = []
-            if row[3] is not None:
-                entity_count = len(row[3])
-            # Color overlapping words
-            input_sentence = color_text(
-                input_sentence,
-                row[1],
-                highlight_idx_input,
-            )  # text, index of highlight words
-            source_sentence = color_text(
-                source_sentence,
-                row[2],
-                highlight_idx_source,
-            )  # text, index of highlight words
-            # Replace _ to get correct formatting
-            # Original one having _ for correct word counting
-            input_sentence = input_sentence.replace(
-                "span_style",
-                "span style",
-            ).replace("1px_4px", "1px 4px")
-            source_sentence = source_sentence.replace(
-                "span_style",
-                "span style",
-            ).replace("1px_4px", "1px 4px")
-        else:
-            input_sentence = row[0]["input"]
-            source_sentence = row[0]["source"]
-        url = row[0]["url"]
-        # Displayed label and score by url
-        filterby_url = self.grouped_url_df[self.grouped_url_df["url"] == url]
-        if len(filterby_url) > 0:
-            label = filterby_url["label"].values[0]
-            score = filterby_url["score"].values[0]
-        else:
-            label = self.text_prediction_label[0]
-            score = self.text_prediction_score[0]
-        # Format displayed url
-        source_text_url = f"""<a href="{url}">{url}</a>"""
-        # Format displayed entity count
-        entity_count_text = format_entity_count(entity_count)
-        border_top = "border-top: 1px solid transparent;"
-        border_bottom = "border-bottom: 1px solid transparent;"
-        word_break = "word-break: break-all;"
-        if first_url_row is True:
-            # First & Last the group: no transparent
-            if last_url_row is True:
-                return f"""
-<tr>
-    <td>{input_sentence}</td>
-    <td>{source_sentence}</td>
-    <td rowspan="{span_row}">{label}<br>
-    ({score * 100:.2f}%)<br><br>
-    {entity_count_text}</td>
-    <td rowspan="{span_row}"; style="{word_break}";>{source_text_url}</td>
-</tr>
-"""
-            # First row of the group: transparent bottom border
-            return f"""
-<tr>
-    <td style="{border_bottom}";>{input_sentence}</td>
-    <td style="{border_bottom}";>{source_sentence}</td>
-    <td rowspan="{span_row}">{label}<br>
-    ({score * 100:.2f}%)<br><br>
-    {entity_count_text}</td>
-    <td rowspan="{span_row}"; style="{word_break}";>{source_text_url}</td>
-</tr>
-"""
-        else:
-            if last_url_row is True:
-                # NOT First row, Last row: transparent top border
-                return f"""
-<tr>
-    <td style="{border_top}";>{input_sentence}</td>
-    <td style="{border_top}";>{source_sentence}</td>
-</tr>
-"""
-            else:
-                # NOT First & NOT Last row: transparent top & bottom borders
-                return f"""
-<tr>
-    <td style="{border_top} {border_bottom}";>{input_sentence}</td>
-    <td style="{border_top} {border_bottom}";>{source_sentence}</td>
-</tr>
-"""
-    def format_image_fact_checker_row(self):
-        if (
-            self.image_referent_url is not None
-            or self.image_referent_url != ""
-        ):
-            source_image = f"""<img src="{self.image_referent_url}" width="100" height="150">"""  # noqa: E501
-            source_image_url = f"""<a href="{self.image_referent_url}">{self.image_referent_url}</a>"""  # noqa: E501
-        else:
-            source_image = "Image not found"
-            source_image_url = ""
-        word_break = "word-break: break-all;"
-        return f"""
-    <tr>
-        <td>input image</td>
-        <td>{source_image}</td>
-        <td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td>
-        <td style="{word_break}";>{source_image_url}</td></tr>"""
-    def create_ordinary_user_table(self):
-        rows = []
-        rows.append(self.format_image_ordinary_user_row())
-        rows.append(self.format_text_ordinary_user_row())
-        table = "\n".join(rows)
-        return f"""
-<h5>Comparison between input news and source news:</h5>
-<table border="1" style="width:100%; text-align:left;">
-<col style="width: 340px;">
-<col style="width: 30px;">
-<col style="width: 75px;">
-    <thead>
-        <tr>
-            <th>Input news</th>
-            <th>Forensic</th>
-            <th>Originality</th>
-        </tr>
-    </thead>
-    <tbody>
-        {table}
-    </tbody>
-</table>
-<style>
-    """
-    def format_text_ordinary_user_row(self):
-        input_sentences = ""
-        source_text_urls = ""
-        urls = []
-        for _, row in self.aligned_sentences_df.iterrows():
-            if row["input"] is None:
-                continue
-            input_sentences += row["input"] + "<br><br>"
-            url = row["url"]
-            if url not in urls:
-                urls.append(url)
-                source_text_urls += f"""<a href="{url}">{url}</a><br>"""
-        word_break = "word-break: break-all;"
-        return f"""
-                <tr>
-                    <td>{input_sentences}</td>
-                    <td>{self.text_prediction_label[0]}<br>
-                    ({self.text_prediction_score[0] * 100:.2f}%)</td>
-                    <td style="{word_break}";>{source_text_urls}</td>
-                </tr>
-                """
-    def format_image_ordinary_user_row(self):
-        if (
-            self.image_referent_url is not None
-            or self.image_referent_url != ""
-        ):
-            source_image_url = f"""<a href="{self.image_referent_url}">{self.image_referent_url}</a>"""  # noqa: E501
-        else:
-            source_image_url = ""
-        word_break = "word-break: break-all;"
-        return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td style="{word_break}";>{source_image_url}</td></tr>"""  # noqa: E501
-    def create_governor_table(self):
-        rows = []
-        rows.append(self.format_image_governor_row())
-        for _, row in self.aligned_sentences_df.iterrows():
-            if row["input"] is None:
-                continue
-            if row["source"] is None:
-                equal_idx_1 = equal_idx_2 = []
-            else:
-                # Get index of equal phrases in input and source sentences
-                equal_idx_1, equal_idx_2 = extract_equal_text(
-                    row["input"],
-                    row["source"],
-                )
-            self.governor_table.append(
-                [
-                    row,
-                    equal_idx_1,
-                    equal_idx_2,
-                    row["entities"],
-                ],
-            )
-        formatted_row = self.format_text_governor_row()
-        rows.append(formatted_row)
-        table = "\n".join(rows)
-        return f"""
-<h5>Comparison between input news and source news:</h5>
-<table border="1" style="width:100%; text-align:left;">
-<col style="width: 170px;">
-<col style="width: 170px;">
-<col style="width: 30px;">
-<col style="width: 75px;">
-    <thead>
-        <tr>
-            <th>Input news</th>
-            <th>Source (URL in Originality)</th>
-            <th>Forensic</th>
-            <th>Originality</th>
-        </tr>
-    </thead>
-    <tbody>
-        {table}
-    </tbody>
-</table>
-<style>
-        """
-    def format_text_governor_row(self):
-        input_sentences = ""
-        source_sentences = ""
-        source_text_urls = ""
-        urls = []
-        sentence_count = 0
-        entity_count = [0, 0]  # to get index of [-2]
-        for row in self.governor_table:
-            if row[0]["input"] is None:
-                continue
-            if row[0]["source"] is not None:  # source is not empty
-                # highlight entities
-                input_sentence, highlight_idx_input = apply_highlight(
-                    row[0]["input"],
-                    row[3],  # entities_with_colors
-                    "input",  # key
-                    entity_count[
-                        -2
-                    ],  # since the last one is for current counting
-                )
-                source_sentence, highlight_idx_source = apply_highlight(
-                    row[0]["source"],
-                    row[3],  # entities_with_colors
-                    "source",  # key
-                    entity_count[
-                        -2
-                    ],  # since the last one is for current counting
-                )
-                # Color overlapping words
-                input_sentence = color_text(
-                    input_sentence,
-                    row[1],
-                    highlight_idx_input,
-                )  # text, index of highlight words
-                source_sentence = color_text(
-                    source_sentence,
-                    row[2],
-                    highlight_idx_source,
-                )  # text, index of highlight words
-                input_sentence = input_sentence.replace(
-                    "span_style",
-                    "span style",
-                ).replace("1px_4px", "1px 4px")
-                source_sentence = source_sentence.replace(
-                    "span_style",
-                    "span style",
-                ).replace("1px_4px", "1px 4px")
-            else:
-                if row[0]["source"] is None:
-                    source_sentence = ""
-                else:
-                    source_sentence = row[0]["source"]
-                input_sentence = row[0]["input"]
-            # convert score to HUMAN-based score:
-            input_sentences += input_sentence + "<br><br>"
-            source_sentences += source_sentence + "<br><br>"
-            url = row[0]["url"]
-            if url not in urls:
-                urls.append(url)
-                source_text_urls += f"""<a href="{url}">{url}</a><br><br>"""
-                sentence_count += 1
-                if row[3] is not None:
-                    entity_count.append(len(row[3]))
-        entity_count_text = format_entity_count(sum(entity_count))
-        word_break = "word-break: break-all;"
-        return f"""
-<tr>
-    <td>{input_sentences}</td>
-    <td>{source_sentences}</td>
-    <td>{self.text_prediction_label[0]}<br>
-        ({self.text_prediction_score[0] * 100:.2f}%)<br><br>
-        {entity_count_text}</td>
-    <td style="{word_break}";>{source_text_urls}</td>
-</tr>
-                """
-    def format_image_governor_row(self):
-        if (
-            self.image_referent_url is not None
-            or self.image_referent_url != ""
-        ):
-            source_image = f"""<img src="{self.image_referent_url}" width="100" height="150">"""  # noqa: E501
-            source_image_url = f"""<a href="{self.image_referent_url}">{self.image_referent_url}</a>"""  # noqa: E501
-        else:
-            source_image = "Image not found"
-            source_image_url = ""
-        word_break = "word-break: break-all;"
-        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td style="{word_break}";>{source_image_url}</td></tr>"""  # noqa: E501

 import pandas as pd
+from src.application.config import (
+    MIN_RATIO_PARAPHRASE_NUM,
+    PARAPHRASE_THRESHOLD,
+    PARAPHRASE_THRESHOLD_MACHINE,
+)
+from src.application.formatting_fact_checker import create_fact_checker_table
+from src.application.formatting_governor import create_governor_table
+from src.application.formatting_ordinary_user import create_ordinary_user_table
+from src.application.image.image import ImageDetector
 from src.application.image.image_detection import (
     detect_image_by_ai_model,
     detect_image_by_reverse_search,
     detect_image_from_news_image,
 )
+from src.application.text.entity import highlight_entities
 from src.application.text.helper import (
     postprocess_label,
     split_into_paragraphs,
 )
     predict_generation_model,
 )
 from src.application.text.search_detection import find_sentence_source
+from src.application.text.text import TextDetector
 class NewsVerification:
         self.news_content: str = ""
         self.news_image: str = ""
+        self.text = TextDetector()
+        self.image = ImageDetector()
         self.news_prediction_label: str = ""
         self.news_prediction_score: float = -1
                 # "entities",
             ],
         )
     def load_news(self, news_title: str, news_content: str, news_image: str):
         """
             )  # Handle mixed data types and NaNs
         # Group sentences by URL and concatenate 'input' and 'source' text.
+        self.text.grouped_url_df = (
             self.aligned_sentences_df.groupby("url")
             .agg(
                 {
         )  # Reset index to make 'url' a regular column
         # Add new columns for label and score
+        self.text.grouped_url_df["label"] = None
+        self.text.grouped_url_df["score"] = None
         print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")
         """
         Determines the text origin for each URL group.
         """
+        for index, row in self.text.grouped_url_df.iterrows():
             # Verify text origin using URL-based verification.
             label, score = self.verify_text(row["url"])
                 # Detect text origin using an AI model.
                 label, score = detect_text_by_ai_model(text)
+            self.text.grouped_url_df.at[index, "label"] = label
+            self.text.grouped_url_df.at[index, "score"] = score
     def determine_text_origin(self):
         """
         self.determine_text_origin_by_url()
         # Determine the overall label and score for the entire input text.
+        if not self.text.grouped_url_df.empty:
             # Check for 'gpt-4o' labels in the grouped URLs.
+            machine_label = self.text.grouped_url_df[
+                self.text.grouped_url_df["label"].str.contains(
                     "gpt-4o",
                     case=False,
                     na=False,
                 # labels = " and ".join(machine_label["label"].tolist())
                 # label = remove_duplicate_words(label)
+                self.text.prediction_label[0] = label
+                self.text.prediction_score[0] = machine_label["score"].mean()
             else:
                 # If no 'gpt-4o' labels, assign for 'HUMAN' labels.
                 machine_label = self.aligned_sentences_df[
                     self.aligned_sentences_df["label"] == "HUMAN"
                 ]
+                self.text.prediction_label[0] = "HUMAN"
+                self.text.prediction_score[0] = machine_label["score"].mean()
         else:
             # If no found URLs, use AI detection on the entire input text.
             print("No source found in the input text")
             # Detect text origin using an AI model.
             label, score = detect_text_by_ai_model(text)
+            self.text.prediction_label[0] = label
+            self.text.prediction_score[0] = score
     def find_text_source(self):
         """
         Determines the origin of the given text based on paraphrasing
             detection and human authorship analysis.
+        1. Splits the input news text into sentences,
         2. Searches for sources for each sentence
         3. Updates the aligned_sentences_df with the found sources.
         """
         print("CHECK TEXT:")
         print("\tFrom search engine:")
         input_paragraphs = split_into_paragraphs(self.news_text)
+        # Initialize an empty DataFrame if it doesn't exist,
+        # otherwise extend it.
+        if (
+            not hasattr(self, "aligned_sentences_df")
+            or self.aligned_sentences_df is None
+        ):
+            self.aligned_sentences_df = pd.DataFrame(
+                columns=[
+                    "input",
+                    "source",
+                    "label",
+                    "similarity",
+                    "paraphrase",
+                    "url",
+                    "entities",
+                ],
+            )
         # Setup DataFrame for input_sentences
         for _ in range(len(input_paragraphs)):
                 index,
                 self.aligned_sentences_df,
             )
             # Initialize found_img_url if it does not exist.
+            if not hasattr(self, "found_img_url"):
                 self.found_img_url = []
             self.found_img_url.extend(img_urls)
     def verify_text(self, url):
         """
+        Verifies the text origin based on similarity scores and labels
             associated with a given URL.
+        1. Filters sentences by URL and similarity score,
+        2. Determines if the text is likely generated by a machine or a human.
         3. Calculates an average similarity score.
         Args:
         Returns:
             tuple: A
+                - Label ("MACHINE", "HUMAN", or "UNKNOWN")
                 - Score
         """
         label = "UNKNOWN"
         score = 0
         # Average the similarity over the sentences whose similarity score
         # exceeds PARAPHRASE_THRESHOLD.
         # Filter sentences by URL.
         filtered_by_url = self.aligned_sentences_df[
             self.aligned_sentences_df["url"] == url
         ]
         # Filter sentences by similarity score (> PARAPHRASE_THRESHOLD).
         filtered_by_similarity = filtered_by_url[
             filtered_by_url["similarity"] > PARAPHRASE_THRESHOLD
         ]
         # Check if a ratio of remaining filtering-sentences is more than 50%.
+        if (
+            len(filtered_by_similarity) / len(self.aligned_sentences_df)
+            > MIN_RATIO_PARAPHRASE_NUM
+        ):
             # check if "MACHINE" is in self.aligned_sentences_df["label"]:
             contains_machine = (
                 filtered_by_similarity["label"]
                 )
                 .any()
             )
             # TODO: integrate with determine_text_origin
             if contains_machine:
                 # If "MACHINE" label is present, set label and calculate score.
                 label = f"Partially generated by {generated_model}"
                 score = machine_rows["similarity"].mean()
             else:
+                # If no "MACHINE" label,
+                # assign "HUMAN" label and calculate score.
                 label = "HUMAN"
                 human_rows = filtered_by_similarity[
                     filtered_by_similarity["label"].str.contains(
     def determine_image_origin(self):
         """
+        Determines the origin of the news image using 3 detection methods.
         1.  Matching against previously found image URLs.
         2.  Reverse image search.
         3.  AI-based image detection.
+        If none of these methods succeed, the image origin is "UNKNOWN".
         """
         print("CHECK IMAGE:")
         # Handle the case where no image is provided.
         if self.news_image is None:
+            self.image.prediction_label = "UNKNOWN"
+            self.image.prediction_score = 0.0
+            self.image.referent_url = None
             return
         # Attempt to match the image against previously found image URLs.
         )
         if matched_url is not None:
             print(f"matched image: {matched_url}\nsimilarity: {similarity}\n")
+            self.image.prediction_label = "HUMAN"
+            self.image.prediction_score = similarity
+            self.image.referent_url = matched_url
             return
         # Attempt to find the image origin using reverse image search.
         )
         if matched_url is not None:
             print(f"matched image: {matched_url}\tScore: {similarity}%\n")
+            self.image.prediction_label = "HUMAN"
+            self.image.prediction_score = similarity
+            self.image.referent_url = matched_url
             return
         # Attempt to detect the image origin using an AI model.
         detected_label, score = detect_image_by_ai_model(self.news_image)
         if detected_label:
             print(f"detected_label: {detected_label} ({score})")
+            self.image.prediction_label = detected_label
+            self.image.prediction_score = score
+            self.image.referent_url = None
             return
         # If all detection methods fail, mark the image origin as "UNKNOWN".
+        self.image.prediction_label = "UNKNOWN"
+        self.image.prediction_score = 50
+        self.image.referent_url = None
     def determine_origin(self):
         """
             self.determine_text_origin()
         if self.news_image != "":
             self.determine_image_origin()
         # Handle entity recognition and processing.
         self.handle_entities()
     def generate_report(self) -> tuple[str, str, str]:
         """
+        Generates reports tailored for different user roles
             (ordinary users, fact checkers, governors).
         Returns:
                 - fact_checker_table: Report for fact checkers.
                 - governor_table: Report for governors.
         """
+        ordinary_user_table = create_ordinary_user_table(
+            self.aligned_sentences_df,
+            self.text,
+            self.image,
+        )
+        fact_checker_table = create_fact_checker_table(
+            self.aligned_sentences_df,
+            self.text,
+            self.image,
+        )
+        governor_table = create_governor_table(
+            self.aligned_sentences_df,
+            self.text,
+            self.image,
+        )
         return ordinary_user_table, fact_checker_table, governor_table
         """
         Highlights and assigns entities with colors to aligned sentences
             based on grouped URLs.
         For each grouped URL:
         1. Highlights entities in the input and source text
+        2. Then assigns these highlighted entities to the corresponding
             sentences in the aligned sentences DataFrame.
         """
         entities_with_colors = []
+        for index, row in self.text.grouped_url_df.iterrows():
             # Get entity-words (in pair) with colors
             entities_with_colors = highlight_entities(
                 row["input"],
                 row["source"],
             )
+            # Assign the highlighted entities to the corresponding sentences
             # in aligned_sentences_df.
             for index, sentence in self.aligned_sentences_df.iterrows():
                 if sentence["url"] == row["url"]:
             set: A set containing the unique URLs referenced in the text.
         """
         return set(self.text_referent_url)

src/application/content_generation.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import json
 import openai
 import pandas as pd
@@ -100,7 +101,7 @@ def extract_title_content(fake_news: str) -> tuple[str, str]:
 def generate_fake_image(
     title: str,
     model: str = GPT_IMAGE_MODEL,
-) -> str | None:
     """
     Generates a fake image URL using Azure OpenAI's image generation API.

 import json
+from typing import Optional
 import openai
 import pandas as pd
 def generate_fake_image(
     title: str,
     model: str = GPT_IMAGE_MODEL,
+) -> Optional[str]:
     """
     Generates a fake image URL using Azure OpenAI's image generation API.

src/application/formatting.py CHANGED Viewed

@@ -1,18 +1,26 @@
-from src.application.text.helper import extract_starts_ends, filter_indices
-def color_text(text: str, colored_idx: list[dict], highlighted_idx: list[int]) -> str:
     """
     Colors specific words in a text based on provided indices.
-    This method takes a text, a list of indices to color, and a list of indices to exclude.
-    It splits the text into words, filters the indices, and then wraps the words within
-    the specified ranges with a green span tag for coloring.
     Args:
         text (str): The input text.
-        colored_idx (list): A list of dictionaries, where each dictionary contains
-                            'start' and 'end' keys representing indices of words to color.
         highlighted_idx (list): A list of indices to exclude from coloring.
     Returns:
@@ -23,7 +31,7 @@ def color_text(text: str, colored_idx: list[dict], highlighted_idx: list[int]) -
     # Extract start and end indices from colored_idx.
     starts, ends = extract_starts_ends(colored_idx)
     # Filter the start and end indices to exclude highlighted_idx.
     starts, ends = filter_indices(starts, ends, highlighted_idx)
@@ -64,4 +72,4 @@ def format_entity_count(entity_count: int) -> str:
         entity_count_text = "with 1 altered entity"
     else:
         entity_count_text = "with altered entities"
-    return entity_count_text

+from src.application.text.helper import (
+    extract_starts_ends,
+    filter_indices,
+)
+def color_text(
+    text: str,
+    colored_idx: list[dict],
+    highlighted_idx: list[int],
+) -> str:
     """
     Colors specific words in a text based on provided indices.
+    1. splits the text into words
+    2. filters the indices
+    3. wraps the words within the specified ranges with a coloring tag
     Args:
         text (str): The input text.
+        colored_idx (list): A list of dictionaries,
+            where each dictionary contains
+            'start' and 'end' keys representing indices of words to color.
         highlighted_idx (list): A list of indices to exclude from coloring.
     Returns:
     # Extract start and end indices from colored_idx.
     starts, ends = extract_starts_ends(colored_idx)
     # Filter the start and end indices to exclude highlighted_idx.
     starts, ends = filter_indices(starts, ends, highlighted_idx)
         entity_count_text = "with 1 altered entity"
     else:
         entity_count_text = "with altered entities"
+    return entity_count_text

src/application/formatting_fact_checker.py ADDED Viewed

	@@ -0,0 +1,238 @@

+from pandas import DataFrame
+from src.application.config import WORD_BREAK
+from src.application.formatting import (
+    color_text,
+    format_entity_count,
+)
+from src.application.image.image import ImageDetector
+from src.application.text.entity import apply_highlight
+from src.application.text.helper import extract_equal_text
+from src.application.text.text import TextDetector
def create_fact_checker_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
) -> str:
    """
    Builds the HTML comparison table shown to fact checkers.

    The table starts with one row for the input image, followed by one row
    per aligned sentence. Consecutive sentences that share a source URL form
    a group: their "Forensic" and "Originality" cells are emitted once, on
    the first row of the group, and stretched over the group with rowspan.

    Args:
        aligned_sentences_df (DataFrame): Aligned sentences; the columns
            "input", "source", "entities" and "url" are read.
        text (TextDetector): Text detection state. Its
            ``fact_checker_table`` buffer is rebuilt by this call.
        image (ImageDetector): Image detection result used for the image row.

    Returns:
        str: HTML markup of the fact-checker table.
    """
    # Rebuild the buffer from scratch so that calling this function twice
    # on the same TextDetector does not duplicate every sentence row.
    text.fact_checker_table = []

    rows = [format_image_fact_checker_row(image)]

    for _, row in aligned_sentences_df.iterrows():
        if row["input"] is None:
            continue

        if row["source"] is None:
            equal_idx_1, equal_idx_2 = [], []
        else:
            # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                row["input"],
                row["source"],
            )

        text.fact_checker_table.append(
            [
                row,  # aligned_sentences_df
                equal_idx_1,  # index of equal text in input
                equal_idx_2,  # index of equal text in source
                row["entities"],
                row["url"],
            ],
        )

    table_rows = text.fact_checker_table
    previous_url = None
    span_row = 1  # rows left in the current URL group, current row included
    for index, row in enumerate(table_rows):
        current_url = row[4]
        first_url_row = index == 0 or current_url != previous_url

        if first_url_row:
            previous_url = current_url
            # Count the following rows that still belong to this URL.
            while (
                index + span_row < len(table_rows)
                and table_rows[index + span_row][4] == current_url
            ):
                span_row += 1
        else:
            span_row -= 1

        # Only one row left in the group means this is its last row.
        last_url_row = span_row == 1

        rows.append(
            format_text_fact_checker_row(
                text,
                row,
                first_url_row,
                last_url_row,
                span_row,
            ),
        )

    table = "\n".join(rows)

    # The original markup ended with an unclosed <style> tag, which would
    # swallow any HTML placed after the table; it carried no rules, so it
    # is dropped.
    return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left;">
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
    <thead>
        <tr>
            <th>Input news</th>
            <th>Source (URL in Originality)</th>
            <th>Forensic</th>
            <th>Originality</th>
        </tr>
    </thead>
    <tbody>
        {table}
    </tbody>
</table>
"""
def format_text_fact_checker_row(
    text: TextDetector,
    row: list,
    first_url_row: bool = True,
    last_url_row: bool = True,
    span_row: int = 1,
):
    """
    Formats one sentence pair as an HTML row of the fact-checker table.

    Args:
        text (TextDetector): Text detection state; ``grouped_url_df`` is
            searched for the label/score of the row's URL, with
            ``prediction_label[0]`` / ``prediction_score[0]`` as fallback.
        row (list): One entry of ``text.fact_checker_table``:
            [aligned sentence row, equal-text indices of the input,
            equal-text indices of the source, entities, url].
        first_url_row (bool): True when the row opens a group of rows that
            share a URL; only such rows carry the label and URL cells.
        last_url_row (bool): True when the row closes its URL group.
        span_row (int): Number of rows the label and URL cells span.

    Returns:
        str: The ``<tr>`` markup, or "" when the row has no input sentence.
    """
    sentence = row[0]
    entities = row[3]

    if sentence["input"] is None:
        return ""

    entity_count = 0
    if sentence["source"] is not None:
        if entities is not None:
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                sentence["input"],
                entities,
                "input",
            )
            source_sentence, highlight_idx_source = apply_highlight(
                sentence["source"],
                entities,
                "source",
            )
            entity_count = len(entities)
        else:
            input_sentence = sentence["input"]
            source_sentence = sentence["source"]
            highlight_idx_input = []
            highlight_idx_source = []

        # Color overlapping words
        input_sentence = color_text(
            input_sentence,
            row[1],
            highlight_idx_input,
        )
        source_sentence = color_text(
            source_sentence,
            row[2],
            highlight_idx_source,
        )

        # Replace _ to get correct formatting
        # Original one having _ for correct word counting
        input_sentence = input_sentence.replace(
            "span_style",
            "span style",
        ).replace("1px_4px", "1px 4px")
        source_sentence = source_sentence.replace(
            "span_style",
            "span style",
        ).replace("1px_4px", "1px 4px")
    else:
        input_sentence = sentence["input"]
        # Show an empty cell instead of the literal text "None".
        source_sentence = ""

    # Rows inside a URL group hide the border they share with a neighbour
    # so the group reads as one block.
    borders = []
    if not first_url_row:
        borders.append("border-top: 1px solid transparent;")
    if not last_url_row:
        borders.append("border-bottom: 1px solid transparent;")
    joined_borders = " ".join(borders)
    cell_style = f' style="{joined_borders}"' if borders else ""

    # Label, score, entity count and URL are shown once per URL group.
    group_cells = ""
    if first_url_row:
        url = sentence["url"]

        # Displayed label and score by url
        grouped = text.grouped_url_df
        if "url" in grouped.columns:
            filterby_url = grouped[grouped["url"] == url]
        else:
            # A default-constructed DataFrame has no "url" column.
            filterby_url = grouped.iloc[0:0]

        if len(filterby_url) > 0:
            label = filterby_url["label"].values[0]
            score = filterby_url["score"].values[0]
        else:
            label = text.prediction_label[0]
            score = text.prediction_score[0]

        source_text_url = f"""<a href="{url}">{url}</a>"""
        entity_count_text = format_entity_count(entity_count)

        group_cells = f"""
    <td rowspan="{span_row}">{label}<br>
    ({score * 100:.2f}%)<br><br>
    {entity_count_text}</td>
    <td rowspan="{span_row}" style="{WORD_BREAK}">{source_text_url}</td>"""

    return f"""
<tr>
    <td{cell_style}>{input_sentence}</td>
    <td{cell_style}>{source_sentence}</td>{group_cells}
</tr>
"""
def format_image_fact_checker_row(image):
    """
    Formats the image analysis result as an HTML row for fact checkers.

    Args:
        image: Object exposing ``referent_url``, ``prediction_label`` and
            ``prediction_score`` (an ImageDetector in the visible callers).

    Returns:
        str: The ``<tr>`` markup with the source image preview, the
            predicted label with its score, and a link to the source image.
    """
    referent_url = image.referent_url

    # The original test `is not None or != ""` was true for every value,
    # so a missing URL produced <img src="None">. Truthiness rejects both
    # None and the empty string.
    if referent_url:
        source_image = (
            f"""<img src="{referent_url}" width="100" height="150">"""
        )
        source_image_url = f"""<a href="{referent_url}">{referent_url}</a>"""
    else:
        source_image = "Image not found"
        source_image_url = ""

    return f"""
<tr>
    <td>input image</td>
    <td>{source_image}</td>
    <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
    <td style="{WORD_BREAK}">{source_image_url}</td>
</tr>
"""

src/application/formatting_governor.py ADDED Viewed

	@@ -0,0 +1,165 @@

+from pandas import DataFrame
+from src.application.config import WORD_BREAK
+from src.application.formatting import (
+    color_text,
+    format_entity_count,
+)
+from src.application.image.image import ImageDetector
+from src.application.text.entity import apply_highlight
+from src.application.text.helper import extract_equal_text
+from src.application.text.text import TextDetector
def create_governor_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
) -> str:
    """
    Builds the HTML comparison table shown to governors.

    The table holds one row for the input image and a single text row in
    which all input sentences, source sentences and source URLs are merged.

    Args:
        aligned_sentences_df (DataFrame): Aligned sentences; the columns
            "input", "source" and "entities" are read here.
        text (TextDetector): Text detection state. Its ``governor_table``
            buffer is rebuilt by this call.
        image (ImageDetector): Image detection result used for the image row.

    Returns:
        str: HTML markup of the governor table.
    """
    # Rebuild the buffer from scratch so that calling this function twice
    # on the same TextDetector does not duplicate every sentence.
    text.governor_table = []

    rows = [format_image_governor_row(image)]

    for _, row in aligned_sentences_df.iterrows():
        if row["input"] is None:
            continue

        if row["source"] is None:
            equal_idx_1, equal_idx_2 = [], []
        else:
            # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                row["input"],
                row["source"],
            )

        text.governor_table.append(
            [
                row,
                equal_idx_1,
                equal_idx_2,
                row["entities"],
            ],
        )

    rows.append(format_text_governor_row(text))
    table = "\n".join(rows)

    # The original markup ended with an unclosed <style> tag, which would
    # swallow any HTML placed after the table; it carried no rules, so it
    # is dropped.
    return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left;">
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
    <thead>
        <tr>
            <th>Input news</th>
            <th>Source (URL in Originality)</th>
            <th>Forensic</th>
            <th>Originality</th>
        </tr>
    </thead>
    <tbody>
        {table}
    </tbody>
</table>
"""
def format_text_governor_row(text):
    """
    Merges every buffered sentence pair into one HTML row for governors.

    Each entry of ``text.governor_table`` is a list of
    [aligned sentence row, equal-text indices of the input,
    equal-text indices of the source, entities].

    Args:
        text: Object exposing ``governor_table``, ``prediction_label`` and
            ``prediction_score`` (a TextDetector in the visible caller).

    Returns:
        str: A ``<tr>`` holding all input sentences, all source sentences,
            the overall label/score with the entity summary, and the
            de-duplicated source links.
    """
    input_cells = []
    source_cells = []
    link_cells = []
    seen_urls = []
    # Two leading zeros keep entity_count[-2] valid before anything is
    # appended. NOTE(review): the value passed on is the count recorded one
    # URL earlier than the latest one; confirm against apply_highlight that
    # this is the intended offset.
    entity_count = [0, 0]

    for entry in text.governor_table:
        sentence = entry[0]
        entities = entry[3]

        if sentence["input"] is None:
            continue

        if sentence["source"] is None:
            # No source found: plain input, empty source cell.
            source_sentence = ""
            input_sentence = sentence["input"]
        else:
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                sentence["input"],
                entities,
                "input",
                entity_count[-2],
            )
            source_sentence, highlight_idx_source = apply_highlight(
                sentence["source"],
                entities,
                "source",
                entity_count[-2],
            )

            # Color overlapping words
            input_sentence = color_text(
                input_sentence,
                entry[1],
                highlight_idx_input,
            )
            source_sentence = color_text(
                source_sentence,
                entry[2],
                highlight_idx_source,
            )

            # Turn the underscore placeholders back into real markup.
            input_sentence = input_sentence.replace(
                "span_style",
                "span style",
            )
            input_sentence = input_sentence.replace("1px_4px", "1px 4px")
            source_sentence = source_sentence.replace(
                "span_style",
                "span style",
            )
            source_sentence = source_sentence.replace("1px_4px", "1px 4px")

        input_cells.append(input_sentence + "<br><br>")
        source_cells.append(source_sentence + "<br><br>")

        url = sentence["url"]
        if url in seen_urls:
            continue

        # First sentence seen for this URL: list the link once and record
        # the entity count of that sentence.
        seen_urls.append(url)
        link_cells.append(f"""<a href="{url}">{url}</a><br><br>""")
        if entities is not None:
            entity_count.append(len(entities))

    input_sentences = "".join(input_cells)
    source_sentences = "".join(source_cells)
    source_text_urls = "".join(link_cells)
    entity_count_text = format_entity_count(sum(entity_count))

    return f"""
<tr>
    <td>{input_sentences}</td>
    <td>{source_sentences}</td>
    <td>{text.prediction_label[0]}<br>
        ({text.prediction_score[0] * 100:.2f}%)<br><br>
        {entity_count_text}</td>
    <td style="{WORD_BREAK}";>{source_text_urls}</td>
</tr>
"""
def format_image_governor_row(image):
    """
    Formats the image analysis result as an HTML row for governors.

    Args:
        image: Object exposing ``referent_url``, ``prediction_label`` and
            ``prediction_score`` (an ImageDetector in the visible callers).

    Returns:
        str: The ``<tr>`` markup with the source image preview, the
            predicted label with its score, and a link to the source image.
    """
    referent_url = image.referent_url

    # The original test `is not None or != ""` was true for every value,
    # so a missing URL produced <img src="None">. Truthiness rejects both
    # None and the empty string.
    if referent_url:
        source_image = (
            f"""<img src="{referent_url}" width="100" height="150">"""
        )
        source_image_url = f"""<a href="{referent_url}">{referent_url}</a>"""
    else:
        source_image = "Image not found"
        source_image_url = ""

    return f"""
<tr>
    <td>input image</td>
    <td>{source_image}</td>
    <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
    <td style="{WORD_BREAK}">{source_image_url}</td>
</tr>"""

src/application/formatting_ordinary_user.py CHANGED Viewed

@@ -1,10 +1,18 @@
 from src.application.config import WORD_BREAK
-def create_ordinary_user_table(self):
     rows = []
-    rows.append(self.format_image_ordinary_user_row())
-    rows.append(self.format_text_ordinary_user_row())
     table = "\n".join(rows)
     return f"""
@@ -28,60 +36,56 @@ def create_ordinary_user_table(self):
 <style>
     """
-def format_text_ordinary_user_row(self):
     input_sentences = ""
-    source_text_urls = ""
     urls = []
-    for _, row in self.aligned_sentences_df.iterrows():
         if row["input"] is None:
             continue
         input_sentences += row["input"] + "<br><br>"
         url = row["url"]
         if url not in urls:
             urls.append(url)
-            source_text_urls += f"""<a href="{url}">{url}</a><br>"""
     return f"""
             <tr>
                 <td>{input_sentences}</td>
-                <td>{self.text_prediction_label[0]}<br>
-                ({self.text_prediction_score[0] * 100:.2f}%)</td>
-                <td style="{WORD_BREAK}";>{source_text_urls}</td>
             </tr>
             """
-def format_image_ordinary_user_row(
-    image_referent_url: str,
-    image_prediction_label: str,
-    image_prediction_score: float,
-):
     """
-    Formats an HTML table row for ordinary users,
         displaying image analysis results.
     Args:
-        image_referent_url (str): The URL of the referenced image.
-        image_prediction_label (str): The predicted label for the image.
-        image_prediction_score (float): The prediction score for the image.
     Returns:
         str: An HTML table row string containing the image analysis results.
     """
     # Put image, label, and score into html tag
-    if (
-        image_referent_url is not None
-        or image_referent_url != ""
-    ):
-        source_image_url = f"""<a href="{image_referent_url}">{image_referent_url}</a>"""  # noqa: E501
     else:
-        source_image_url = ""
     return f"""
 <tr>
     <td>input image</td>
-    <td>{image_prediction_label}<br>({image_prediction_score:.2f}%)</td>
-    <td style="{WORD_BREAK}";>{source_image_url}</td>
 </tr>
-"""

+from pandas import DataFrame
 from src.application.config import WORD_BREAK
+from src.application.image.image import ImageDetector
+from src.application.text.text import TextDetector
+def create_ordinary_user_table(
+    aligned_sentences_df: DataFrame,
+    text: TextDetector,
+    image: ImageDetector,
+) -> str:
     rows = []
+    rows.append(format_image_ordinary_user_row(image))
+    rows.append(format_text_ordinary_user_row(aligned_sentences_df, text))
     table = "\n".join(rows)
     return f"""
 <style>
     """
def format_text_ordinary_user_row(
    aligned_sentences_df,
    text,
) -> str:
    """
    Formats the text analysis result as an HTML row for ordinary users.

    Args:
        aligned_sentences_df: DataFrame of aligned sentences; the columns
            "input" and "url" are read.
        text: Object exposing ``prediction_label`` and ``prediction_score``
            (a TextDetector in the visible caller).

    Returns:
        str: A ``<tr>`` holding all input sentences, the overall label with
            its score, and one link per distinct source URL.
    """
    sentence_parts = []
    link_parts = []
    seen_urls = []

    for _, sentence in aligned_sentences_df.iterrows():
        if sentence["input"] is None:
            continue
        sentence_parts.append(sentence["input"] + "<br><br>")

        link = sentence["url"]
        if link in seen_urls:
            continue
        # List each source URL once, in order of first appearance.
        seen_urls.append(link)
        link_parts.append(f"""<a href="{link}">{link}</a><br>""")

    sentences_html = "".join(sentence_parts)
    links_html = "".join(link_parts)

    return f"""
            <tr>
                <td>{sentences_html}</td>
                <td>{text.prediction_label[0]}<br>
                ({text.prediction_score[0] * 100:.2f}%)</td>
                <td style="{WORD_BREAK}";>{links_html}</td>
            </tr>
            """
def format_image_ordinary_user_row(image: ImageDetector) -> str:
    """
    Formats an HTML table row for ordinary users,
        displaying image analysis results.

    Args:
        image (ImageDetector): Image detection result; ``referent_url``,
            ``prediction_label`` and ``prediction_score`` are read.

    Returns:
        str: An HTML table row string containing the image analysis results.
    """
    referent_url = image.referent_url

    # The original test `is not None or != ""` was true for every value,
    # so a missing URL produced a link to "None". Truthiness rejects both
    # None and the empty string.
    if referent_url:
        source_image_html = f"""<a href="{referent_url}">{referent_url}</a>"""
    else:
        source_image_html = ""

    return f"""
<tr>
    <td>input image</td>
    <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
    <td style="{WORD_BREAK}">{source_image_html}</td>
</tr>
"""

src/application/image/image.py ADDED Viewed

	@@ -0,0 +1,5 @@

class ImageDetector:
    """Holds the outcome of the image origin analysis."""

    def __init__(self):
        # Every field starts as None until a detection step fills it in,
        # so the annotations are optional. Annotations inside a function
        # body are never evaluated, so the `X | None` spelling does not
        # raise on interpreters older than 3.10.
        self.referent_url: str | None = None  # URL of the referenced image.
        self.prediction_label: str | None = None  # Predicted origin label.
        self.prediction_score: float | None = None  # Score for the label.

src/application/text/helper.py CHANGED Viewed

@@ -296,11 +296,11 @@ def postprocess_label(labels: list[str]) -> str:
     prefix = "Partially generated by "
     for index, label in enumerate(labels):
         if label.startswith(prefix):
-            labels[index] = label[len(prefix):]
     labels = list(set(labels))
     label = prefix
     if len(labels) == 1:
         label += labels[0]
     elif len(labels) == 2:
@@ -371,12 +371,14 @@ def split_into_paragraphs(input_text: str) -> list[str]:
     return out_paragraphs
-def extract_starts_ends(colored_idx: list[dict]) -> tuple[list[int], list[int]]:
     """
     Extracts start and end indices from a list of dictionaries.
     Args:
-        colored_idx (list[dict]): A list of dictionaries,
             where each dictionary has 'start' and 'end' keys.
     Returns:
@@ -392,19 +394,23 @@ def extract_starts_ends(colored_idx: list[dict]) -> tuple[list[int], list[int]]:
     return starts, ends
-def filter_indices(starts: list[int], ends: list[int], ignore_indices: list[int]):
     """
     Filters start and end indices to exclude any indices present in the
         ignore_indices list.
     Args:
         starts (list[int]): A list of starting indices.
-        ends (list[int]): A list of ending indices.
             Must be the same length as starts.
         ignore_indices (list[int]): A list of indices to exclude.
     Returns:
-        A tuple of two lists of integers:
             - filtered_starts
             - filtered_ends
         Returns empty lists if the input is invalid
@@ -454,9 +460,13 @@ def filter_indices(starts: list[int], ends: list[int], ignore_indices: list[int]
     return filtered_starts, filtered_ends
-def extract_new_startend(start: int, end: int, ignore_indices: list[int]) -> tuple[list[int], list[int]]:
     """
-    Extracts new start and end indices by splitting a range based on
         ignored indices.
     Args:
@@ -476,7 +486,7 @@ def extract_new_startend(start: int, end: int, ignore_indices: list[int]) -> tup
     new_starts = []
     new_ends = []
     new_start = start
     # If no indices to ignore, return the original range.
     if indexes is None or len(indexes) < 1:
         new_starts.append(start)
@@ -489,7 +499,7 @@ def extract_new_startend(start: int, end: int, ignore_indices: list[int]) -> tup
             continue
         elif index >= end:
             continue
         new_starts.append(new_start)
         new_ends.append(index)
@@ -498,4 +508,4 @@ def extract_new_startend(start: int, end: int, ignore_indices: list[int]) -> tup
     new_starts.append(new_start)
     new_ends.append(end)
-    return new_starts, new_ends

     prefix = "Partially generated by "
     for index, label in enumerate(labels):
         if label.startswith(prefix):
+            labels[index] = label[len(prefix) :]
     labels = list(set(labels))
     label = prefix
     if len(labels) == 1:
         label += labels[0]
     elif len(labels) == 2:
     return out_paragraphs
+def extract_starts_ends(
+    colored_idx: list[dict],
+) -> tuple[list[int], list[int]]:
     """
     Extracts start and end indices from a list of dictionaries.
     Args:
+        colored_idx (list[dict]): A list of dictionaries,
             where each dictionary has 'start' and 'end' keys.
     Returns:
     return starts, ends
+def filter_indices(
+    starts: list[int],
+    ends: list[int],
+    ignore_indices: list[int],
+):
     """
     Filters start and end indices to exclude any indices present in the
         ignore_indices list.
     Args:
         starts (list[int]): A list of starting indices.
+        ends (list[int]): A list of ending indices.
             Must be the same length as starts.
         ignore_indices (list[int]): A list of indices to exclude.
     Returns:
+        A tuple of two lists of integers:
             - filtered_starts
             - filtered_ends
         Returns empty lists if the input is invalid
     return filtered_starts, filtered_ends
+def extract_new_startend(
+    start: int,
+    end: int,
+    ignore_indices: list[int],
+) -> tuple[list[int], list[int]]:
     """
+    Extracts new start and end indices by splitting a range based on
         ignored indices.
     Args:
     new_starts = []
     new_ends = []
     new_start = start
     # If no indices to ignore, return the original range.
     if indexes is None or len(indexes) < 1:
         new_starts.append(start)
             continue
         elif index >= end:
             continue
         new_starts.append(new_start)
         new_ends.append(index)
     new_starts.append(new_start)
     new_ends.append(end)
+    return new_starts, new_ends

src/application/text/search_detection.py CHANGED Viewed

@@ -3,6 +3,7 @@ Author: Khanh Phan
 Date: 2024-12-04
 """
 import warnings
 import numpy as np
@@ -229,7 +230,7 @@ def check_paraphrase(input_text: str, source_text: str, url: str) -> dict:
     return alignment
-def determine_label(similarity: float) -> tuple[str | None, bool]:
     """
     Determines a label and paraphrase status based on the similarity score.

 Date: 2024-12-04
 """
+from typing import Optional
 import warnings
 import numpy as np
     return alignment
+def determine_label(similarity: float) -> tuple[Optional[str], bool]:
     """
     Determines a label and paraphrase status based on the similarity score.

src/application/text/text.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import pandas as pd
class TextDetector:
    """Holds the outcome of the text origin analysis and report buffers."""

    def __init__(self):
        # Overall verdict for the input text; index 0 is the value that
        # the report formatters display.
        self.prediction_label: list[str] = ["UNKNOWN"]
        self.prediction_score: list[float] = [0.0]

        # Per-URL results; the formatters read its "url", "label" and
        # "score" columns.
        self.grouped_url_df: pd.DataFrame = pd.DataFrame()

        # Row buffers used while formatting the output tables.
        self.ordinary_user_table: list = []
        self.fact_checker_table: list = []
        self.governor_table: list = []