Spaces:

pmkhanh7890
/

news_verification

Sleeping

File size: 29,554 Bytes

from difflib import SequenceMatcher

import pandas as pd

from src.application.image.image_detection import (
    detect_image_by_ai_model,
    detect_image_by_reverse_search,
    detect_image_from_news_image,
)
from src.application.text.entity import (
    apply_highlight,
    highlight_entities,
)
from src.application.text.helper import extract_equal_text
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.preprocessing import split_into_paragraphs
from src.application.text.search_detection import (
    check_human,
    detect_text_by_relative_search,
    find_text_source,
)


class NewsVerification:
    def __init__(self):
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""

        self.text_prediction_label: list[str] = []
        self.text_prediction_score: list[float] = []
        self.text_referent_url: list[str] = []
        self.image_prediction_label: list[str] = []
        self.image_prediction_score: list[str] = []
        self.image_referent_url: list[str] = []
        self.news_prediction_label = ""
        self.news_prediction_score = -1

        self.found_img_url: list[str] = []
        self.aligned_sentences: list[dict] = []
        self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
            columns=[
                "input_sentence",
                "matched_sentence",
                "label",
                "similarity",
                "paraphrase",
                "url",
                "group",
                "entities",
            ],
        )
        self.is_paraphrased: list[bool] = []

        self.ordinary_user_table: list = []
        self.fact_checker_table: list = []
        self.governor_table: list = []
        self.entities_with_colors = []

    def load_news(self, news_title, news_content, news_image):
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def determine_text_origin(self):
        """
        Aligns the loaded news text with searched sources and labels each
            resulting segment.

        Takes no arguments: the text is read from ``self.news_text`` and
            split with ``split_into_paragraphs``. Paragraphs for which the
            relative search reports no paraphrase are concatenated (joined
            with "<br>") and scored by ``detect_text_by_ai_model``;
            paragraphs with a match are labelled "HUMAN" or "MACHINE"
            according to ``check_human``.

        Returns:
            None. Results are appended to ``self.aligned_sentences`` and
                image URLs returned by the search are added to
                ``self.found_img_url``.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        # Classify by search engine
        input_sentences = split_into_paragraphs(self.news_text)
        # Updated from the search result on every call; presumably the index
        # of the first paragraph not yet covered - TODO confirm in helper.
        current_index = 0
        # Result of the previous search: None before the first paragraph.
        previous_paraphrase = None
        # Accumulator for consecutive paragraphs without a matched source.
        ai_sentence = {
            "input_sentence": "",
            "matched_sentence": "",
            "label": "",
            "similarity": None,
            "paraphrase": False,
            "url": "",
        }

        for index, sentence in enumerate(input_sentences):
            print(f"-------index = {index}-------")
            print(f"current_sentence = {input_sentences[index]}")

            if current_index >= len(input_sentences):
                break
            # Skip paragraphs the search has already moved past, but never
            # the first or the last one.
            if (
                current_index > index
                and index != 0
                and index != len(input_sentences) - 1
            ):
                continue

            (
                paraphrase,
                text_url,
                searched_sentences,
                img_urls,
                current_index,
            ) = detect_text_by_relative_search(input_sentences, index)

            if paraphrase is False:
                # add sentence to ai_sentence
                if ai_sentence["input_sentence"] != "":
                    ai_sentence["input_sentence"] += "<br>"
                ai_sentence["input_sentence"] += sentence
                if index == len(input_sentences) - 1:
                    # Last paragraph: nothing follows that could flush the
                    # accumulator, so score and store it now.
                    text_prediction_label, text_prediction_score = (
                        detect_text_by_ai_model(ai_sentence["input_sentence"])
                    )
                    ai_sentence["label"] = text_prediction_label
                    ai_sentence["similarity"] = text_prediction_score
                    self.aligned_sentences.append(ai_sentence)
            else:
                if previous_paraphrase is False or previous_paraphrase is None:
                    # A matched paragraph follows unmatched ones (or is the
                    # first result): score and store the accumulated text.
                    if ai_sentence[
                        "input_sentence"
                    ] != "" or current_index >= len(input_sentences):
                        text_prediction_label, text_prediction_score = (
                            detect_text_by_ai_model(
                                ai_sentence["input_sentence"],
                            )
                        )
                        ai_sentence["label"] = text_prediction_label
                        ai_sentence["similarity"] = text_prediction_score
                        self.aligned_sentences.append(ai_sentence)

                        # Start a fresh accumulator; the stored dict must
                        # not be mutated by later paragraphs.
                        ai_sentence = {
                            "input_sentence": "",
                            "matched_sentence": "",
                            "label": "",
                            "similarity": None,
                            "paraphrase": False,
                            "url": "",
                        }

                # add searched_sentences to align_sentences
                if searched_sentences["input_sentence"] != "":
                    self.found_img_url.extend(img_urls)
                    if check_human(searched_sentences):
                        searched_sentences["label"] = "HUMAN"
                    else:
                        searched_sentences["label"] = "MACHINE"

                    self.aligned_sentences.append(searched_sentences)

            previous_paraphrase = paraphrase

    def determine_text_origin_2(self):
        """
        DataFrame-based variant of the text-origin search.

        Takes no arguments: the text is read from ``self.news_text``. Five
            empty rows are appended to ``self.aligned_sentences_df`` and
            each paragraph is then meant to be passed to
            ``find_text_source``.

        Returns:
            None. ``self.aligned_sentences_df`` is updated in place of a
                return value.

        NOTE(review): as written, the guard inside the loop is always true,
            so ``find_text_source`` is never called. See the comment at the
            guard.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        # Classify by search engine
        input_sentences = split_into_paragraphs(self.news_text)
        # Append five placeholder rows. With ignore_index=False each
        # appended row keeps its own index label - presumably unintended,
        # TODO confirm.
        for _ in range(5):
            self.aligned_sentences_df = pd.concat(
                [self.aligned_sentences_df, pd.DataFrame([{}])],
                ignore_index=False,
            )

        for index, sentence in enumerate(input_sentences):
            print(f"-------index = {index}-------")
            print(f"current_sentence = {input_sentences[index]}")

            # NOTE(review): selecting a column yields a Series object, which
            # is never None, so this always continues. A per-row check of
            # the "url" value was presumably intended - confirm with author.
            if self.aligned_sentences_df["url"] is not None:
                continue

            # img_urls is received here but not used afterwards.
            self.aligned_sentences_df, img_urls = find_text_source(
                input_sentences[index],
                self.aligned_sentences_df,
            )

    def detect_image_origin(self):
        print("CHECK IMAGE:")
        if self.news_image is None:
            self.image_prediction_label = "UNKNOWN"
            self.image_prediction_score = 0.0
            self.image_referent_url = None
            return

        for image in self.found_img_url:
            print(f"\tfound_img_url: {image}")
        matched_url, similarity = detect_image_from_news_image(
            self.news_image,
            self.found_img_url,
        )
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        matched_url, similarity = detect_image_by_reverse_search(
            self.news_image,
        )
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            print(f"detected_label: {detected_label} ({score})")
            self.image_prediction_label = detected_label
            self.image_prediction_score = score
            self.image_referent_url = None
            return

        self.image_prediction_label = "UNKNOWN"
        self.image_prediction_score = 50
        self.image_referent_url = None

    def determine_news_origin(self):
        if self.text_prediction_label == "MACHINE":
            text_prediction_score = 100 - self.text_prediction_score
        elif self.text_prediction_label == "UNKNOWN":
            text_prediction_score = 50
        else:
            text_prediction_score = self.text_prediction_score

        if self.image_prediction_label == "MACHINE":
            image_prediction_score = 100 - self.image_prediction_score
        elif self.image_prediction_label == "UNKNOWN":
            image_prediction_score = 50
        else:
            image_prediction_score = self.image_prediction_score

        news_prediction_score = (
            text_prediction_score + image_prediction_score
        ) / 2
        if news_prediction_score > 50:
            self.news_prediction_score = news_prediction_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - news_prediction_score
            self.news_prediction_label = "MACHINE"

    def generate_analysis_report(self):
        """Runs the text analysis and then the image analysis."""
        # Text goes first: it collects the image URLs that the image check
        # compares against.
        for analysis_step in (
            self.determine_text_origin,
            self.detect_image_origin,
        ):
            analysis_step()

    def analyze_details(self):
        entities_with_colors = []
        for index, aligned_sentence in enumerate(self.aligned_sentences):
            # Get entity-words (in pair) with colors
            entities_with_colors = highlight_entities(
                aligned_sentence["input_sentence"],
                aligned_sentence["matched_sentence"],
            )
            self.aligned_sentences[index]["entities"] = entities_with_colors

        ordinary_user_table = self.create_ordinary_user_table()
        fact_checker_table = self.create_fact_checker_table()
        governor_table = self.create_governor_table()

        return ordinary_user_table, fact_checker_table, governor_table

    def get_text_urls(self):
        return set(self.text_referent_url)

    def compare_sentences(self, sentence_1, sentence_2, position, color):
        """
        Compares two sentences and identifies common phrases,
            outputting their start and end positions.

        Args:
            sentence_1: First sentence; its offsets are shifted by position.
            sentence_2: Second sentence; its offsets are left unshifted.
            position: Offset added to every start/end index in sentence_1.
            color: Value copied into the "color" field of every phrase.

        Returns:
            tuple: (common_phrases, new_position). common_phrases is a list
                of dicts with the keys "phrase", "start_1", "end_1",
                "start_2", "end_2" and "color". new_position is position
                advanced by the length of sentence_1.
        """
        if not sentence_1 or not sentence_2:
            # Nothing can match. Keep the same (phrases, position) shape as
            # the normal path so callers can always unpack the result.
            return [], position + len(sentence_1 or "")

        matcher = SequenceMatcher(None, sentence_1, sentence_2)
        common_phrases = [
            {
                # sentence_2[block.b:block.b + block.size] is the same text.
                "phrase": sentence_1[block.a:block.a + block.size],
                "start_1": block.a + position,
                "end_1": block.a + block.size + position,
                "start_2": block.b,
                "end_2": block.b + block.size,
                "color": color,
            }
            for block in matcher.get_matching_blocks()
            if block.size > 0  # the final block is a zero-length sentinel
        ]
        return common_phrases, position + len(sentence_1)

    def create_fact_checker_table(self):
        rows = []
        max_length = 30  # TODO: put this in configuration
        rows.append(self.format_image_fact_checker_row(max_length))

        for aligned_sentence in self.aligned_sentences:
            if "input_sentence" not in aligned_sentence:
                continue

            # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                aligned_sentence["input_sentence"],
                aligned_sentence["matched_sentence"],
            )

            # Get entity-words (in pair) with colors
            # entities_with_colors = highlight_entities(
            #         aligned_sentence["input_sentence"],
            #         aligned_sentence["matched_sentence"],
            #     )

            self.fact_checker_table.append(
                [
                    aligned_sentence,
                    equal_idx_1,
                    equal_idx_2,
                    aligned_sentence["entities"],
                ],
            )

        for row in self.fact_checker_table:
            formatted_row = self.format_text_fact_checker_row(row, max_length)
            rows.append(formatted_row)

        table = "\n".join(rows)
        return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left;">
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
    <thead>
        <tr>
            <th>Input news</th>
            <th>Source (corresponding URL provided in Originality)</th>
            <th>Forensic</th>
            <th>Originality</th>
        </tr>
    </thead>
    <tbody>
        {table}
    </tbody>
</table>

<style>
    """

    def format_text_fact_checker_row(self, row, max_length=30):
        entity_count = 0
        if row[0]["input_sentence"] == "":
            return ""
        if row[0]["matched_sentence"] != "":  # source is not empty
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                row[0]["input_sentence"],
                row[3],
                "input",
            )
            source_sentence, highlight_idx_source = apply_highlight(
                row[0]["matched_sentence"],
                row[3],
                "source",
            )
            entity_count = len(row[3])

            # Color overlapping words
            input_sentence = self.color_text(
                input_sentence,
                row[1],
                highlight_idx_input,
            )  # text, index of highlight words
            source_sentence = self.color_text(
                source_sentence,
                row[2],
                highlight_idx_source,
            )  # text, index of highlight words

            input_sentence = input_sentence.replace(
                "span_style",
                "span style",
            ).replace("1px_4px", "1px 4px")
            source_sentence = source_sentence.replace(
                "span_style",
                "span style",
            ).replace("1px_4px", "1px 4px")
        else:
            input_sentence = row[0]["input_sentence"]
            source_sentence = row[0]["matched_sentence"]

        label = row[0]["label"]
        score = row[0]["similarity"]

        url = row[0]["url"]  #
        short_url = self.shorten_url(url, max_length)
        source_text_url = f"""<a href="{url}">{short_url}</a>"""

        entity_count_text = self.get_entity_count_text(entity_count)

        return f"""
                <tr>
                    <td>{input_sentence}</td>
                    <td>{source_sentence}</td>
                    <td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td>  # noqa: E501
                    <td>{source_text_url}</td>
                </tr>
                """

    def format_image_fact_checker_row(self, max_length=30):

        if (
            self.image_referent_url is not None
            or self.image_referent_url != ""
        ):
            source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""  # noqa: E501
            short_url = self.shorten_url(self.image_referent_url, max_length)
            source_image_url = (
                f"""<a href="{self.image_referent_url}">{short_url}</a>"""
            )
        else:
            source_image = "Image not found"
            source_image_url = ""

        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""  # noqa: E501

    def create_ordinary_user_table(self):
        rows = []
        max_length = 30  # TODO: put this in configuration
        rows.append(self.format_image_ordinary_user_row(max_length))
        rows.append(self.format_text_ordinary_user_row(max_length))
        table = "\n".join(rows)

        return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left; border-collapse:collapse;">  # noqa: E501
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
    <thead>
        <tr>
            <th>Input news</th>
            <th>Forensic</th>
            <th>Originality</th>
        </tr>
    </thead>
    <tbody>
        {table}
    </tbody>
</table>

<style>
    """

    def format_text_ordinary_user_row(self, max_length=30):
        input_sentences = ""
        source_text_urls = ""
        label = ""
        scores = 0
        sentence_count = 0
        for index, row in enumerate(self.aligned_sentences):
            if row["input_sentence"] == "":
                continue
            input_sentences += row["input_sentence"] + "<br><br>"
            label = self.aligned_sentences[index]["label"]

            url = self.aligned_sentences[index]["url"]  #
            short_url = self.shorten_url(url, max_length)
            source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
            sentence_count += 1

        scores, label = self.calculate_score_label()

        return f"""
                <tr>
                    <td>{input_sentences}</td>
                    <td>{label}<br>({scores * 100:.2f}%)</td>
                    <td>{source_text_urls}</td>
                </tr>
                """

    def format_image_ordinary_user_row(self, max_length=30):

        if (
            self.image_referent_url is not None
            or self.image_referent_url != ""
        ):
            short_url = self.shorten_url(self.image_referent_url, max_length)
            source_image_url = (
                f"""<a href="{self.image_referent_url}">{short_url}</a>"""
            )
        else:
            # source_image = "Image not found"
            source_image_url = ""

        return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""  # noqa: E501

    def create_governor_table(self):
        rows = []
        max_length = 30  # TODO: put this in configuration
        rows.append(self.format_image_governor_row(max_length))

        for aligned_sentence in self.aligned_sentences:
            if "input_sentence" not in aligned_sentence:
                continue

            # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                aligned_sentence["input_sentence"],
                aligned_sentence["matched_sentence"],
            )

            # Get entity-words (in pair) with colors
            # entities_with_colors = highlight_entities(
            #         aligned_sentence["input_sentence"],
            #         aligned_sentence["matched_sentence"],
            #     )

            self.governor_table.append(
                [
                    aligned_sentence,
                    equal_idx_1,
                    equal_idx_2,
                    aligned_sentence["entities"],
                ],
            )

        formatted_row = self.format_text_governor_row(max_length)
        rows.append(formatted_row)

        table = "\n".join(rows)
        return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left;">
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
    <thead>
        <tr>
            <th>Input news</th>
            <th>Source (corresponding URL provided in Originality)</th>
            <th>Forensic</th>
            <th>Originality</th>
        </tr>
    </thead>
    <tbody>
        {table}
    </tbody>
</table>

<style>
        """

    def format_text_governor_row(self, max_length=30):
        input_sentences = ""
        source_sentences = ""
        source_text_urls = ""
        label = ""
        sentence_count = 0
        entity_count = 0
        for row in self.governor_table:
            print(f"governor_row: {row}")
            if row[0]["input_sentence"] == "":
                continue

            if row[0]["matched_sentence"] != "":  # source is not empty
                # highlight entities
                input_sentence, highlight_idx_input = apply_highlight(
                    row[0]["input_sentence"],
                    row[3],
                    "input",
                    entity_count,
                )
                source_sentence, highlight_idx_source = apply_highlight(
                    row[0]["matched_sentence"],
                    row[3],
                    "source",
                    entity_count,
                )
                entity_count += len(row[3])

                # Color overlapping words
                input_sentence = self.color_text(
                    input_sentence,
                    row[1],
                    highlight_idx_input,
                )  # text, index of highlight words
                source_sentence = self.color_text(
                    source_sentence,
                    row[2],
                    highlight_idx_source,
                )  # text, index of highlight words

                input_sentence = input_sentence.replace(
                    "span_style",
                    "span style",
                ).replace("1px_4px", "1px 4px")
                source_sentence = source_sentence.replace(
                    "span_style",
                    "span style",
                ).replace("1px_4px", "1px 4px")

            else:
                input_sentence = row[0]["input_sentence"]
                source_sentence = row[0]["matched_sentence"]

            # convert score to HUMAN-based score:
            input_sentences += input_sentence + "<br><br>"
            source_sentences += source_sentence + "<br><br>"

            url = row[0]["url"]
            short_url = self.shorten_url(url, max_length)
            source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
            sentence_count += 1

        score, label = self.calculate_score_label()
        entity_count_text = self.get_entity_count_text(entity_count)

        return f"""
<tr>
    <td>{input_sentences}</td>
    <td>{source_sentences}</td>
    <td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td>
    <td>{source_text_urls}</td>
</tr>
                """

    def format_image_governor_row(self, max_length=30):
        if (
            self.image_referent_url is not None
            or self.image_referent_url != ""
        ):
            source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""  # noqa: E501
            short_url = self.shorten_url(self.image_referent_url, max_length)
            source_image_url = (
                f"""<a href="{self.image_referent_url}">{short_url}</a>"""
            )
        else:
            source_image = "Image not found"
            source_image_url = ""

        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""  # noqa: E501

    def get_entity_count_text(self, entity_count):
        """
        Returns the short note shown when entities were altered.

        Args:
            entity_count: Number of altered entities.

        Returns:
            str: "" for zero or fewer, otherwise a singular or plural note.
        """
        if entity_count <= 0:
            return ""
        return (
            "with altered entity"
            if entity_count == 1
            else "with altered entities"
        )

    def shorten_url(self, url, max_length=30):
        """
        Truncates a URL for display.

        Args:
            url: The URL, or None.
            max_length: Number of leading characters to keep.

        Returns:
            str: "" for None; the URL itself when it is not longer than
                max_length; otherwise its first max_length characters
                followed by "...".
        """
        if url is None:
            return ""
        return url[:max_length] + "..." if len(url) > max_length else url

    def color_text(self, text, colored_idx, highlighted_idx):
        paragraph = ""
        words = text.split()

        starts, ends = self.extract_starts_ends(colored_idx)
        starts, ends = self.filter_indices(starts, ends, highlighted_idx)

        previous_end = 0
        for start, end in zip(starts, ends):
            paragraph += " ".join(words[previous_end:start])

            equal_words = " ".join(words[start:end])
            paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "

            previous_end = end

        # Some left words due to the punctuation separated from
        # the highlighting text
        equal_words = " ".join(words[previous_end:])
        print(f"starts_2: {previous_end}")
        print(f"ends_2: {len(words) - 1}")
        print(f"equal_words: {words[previous_end:]}")
        paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "

        return paragraph

    def extract_starts_ends(self, colored_idx):
        """
        Splits index dicts into parallel lists.

        Args:
            colored_idx: Iterable of dicts with "start" and "end" keys.

        Returns:
            tuple: (starts, ends), two lists in the order of the input.
        """
        # Read both keys in a single pass over the input.
        bounds = [(entry["start"], entry["end"]) for entry in colored_idx]
        starts = [begin for begin, _ in bounds]
        ends = [stop for _, stop in bounds]
        return starts, ends

    def filter_indices(self, starts, ends, ignore_indices):
        """
        Filters start and end indices to exclude any indices present in the
            ignore_indices list.

        Args:
            starts: A list of starting indices.
            ends: A list of ending indices. Must be the same length as starts.
            ignore_indices: A list of indices to exclude.

        Returns:
            A tuple of two lists: filtered_starts and filtered_ends.
            Returns empty lists if the input is invalid
                or if all ranges are filtered out.
            Prints error messages for invalid input.

        Examples:
            starts = [0, 5, 10]
            ends = [3, 7, 12]
            ignore_indices = [1, 2, 11, 17]

            # Output:
                starts = [0, 3, 5, 10, 12]
                ends = [0, 3, 7, 10, 12]

        """

        if len(starts) != len(ends):
            print(
                "Error: The 'starts' and 'ends' lists must have the same length.",  # noqa: E501
            )
            return [], []

        filtered_starts = []
        filtered_ends = []

        for i in range(len(starts)):
            start = starts[i]
            end = ends[i]

            if end < start:
                print(
                    f"Error: End index {end} is less than start index {start} at position {i}.",  # noqa: E501
                )
                return [], []

            start_end = list(range(start, end + 1, 1))
            start_end = list(set(start_end) - set(ignore_indices))
            new_start, new_end = self.extract_sequences(start_end)
            filtered_starts.extend(new_start)
            filtered_ends.extend(new_end)

        return filtered_starts, filtered_ends

    def extract_sequences(self, numbers):
        """
        Groups integers into runs of consecutive values.

        Args:
            numbers: Integers in any order. The list is not modified.

        Returns:
            tuple: (starts, ends), parallel lists with one entry per run.
                Each start is the first value of a run and each end is one
                past its last value, so ``words[start:end]`` covers the
                run. A run of a single value n therefore yields start n and
                end n + 1, the same as it does inside a longer input.
        """
        starts = []
        ends = []
        for number in sorted(numbers):
            if ends and number == ends[-1]:
                # Directly follows the current run: extend it by one.
                ends[-1] = number + 1
            else:
                starts.append(number)
                ends.append(number + 1)
        return starts, ends

    def calculate_score_label(self):
        human_score = []
        machine_score = []
        machine_flag = False
        for sentence in self.aligned_sentences:
            if sentence["input_sentence"] == "":
                continue
            if sentence["label"] == "HUMAN":
                human_score.append(sentence["similarity"])
            elif sentence["label"] == "MACHINE":
                machine_score.append(1 - sentence["similarity"])
                machine_flag = True

        if machine_flag is True and len(machine_score) > 0:
            # average value of machine_score
            machine_score_avg = sum(machine_score) / len(machine_score)
            if machine_score_avg < 0.5:
                machine_score_avg = 1 - machine_score_avg
            return machine_score_avg, "MACHINE"
        elif machine_flag is False and len(human_score) > 0:
            # average value of human_score
            human_score_avg = sum(human_score) / len(human_score)
            return human_score_avg, "HUMAN"
        else:
            return 0, "UNKNOWN"