Spaces:

pmkhanh7890
/

news_verification

Sleeping

File size: 26,697 Bytes

from difflib import SequenceMatcher
from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
from src.application.text.entity import apply_highlight, highlight_entities
from src.application.text.helper import extract_equal_text
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.preprocessing import split_into_paragraphs
from src.application.text.search_detection import check_human, detect_text_by_relative_search


class NewsVerification():
    def __init__(self):
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""
        
        self.text_prediction_label:list[str] = []
        self.text_prediction_score:list[float] = []
        self.text_referent_url:list[str] = []
        self.image_prediction_label:list[str] = []
        self.image_prediction_score:list[str] = []
        self.image_referent_url:list[str] = []
        self.news_prediction_label = ""
        self.news_prediction_score = -1
        
        self.found_img_url:list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"]
        self.aligned_sentences:list[dict] = []
        self.is_paraphrased:list[bool] = []
        
        self.ordinary_user_table:list = []
        self.fact_checker_table:list = []
        self.governor_table:list = []
        
    def load_news(self, news_title, news_content, news_image):
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def determine_text_origin(self):
        """
        Aligns the loaded news text with searched sources and labels each passage.

        The text in ``self.news_text`` is split with ``split_into_paragraphs`` and
        each piece is passed to ``detect_text_by_relative_search``. Pieces for which
        the search reports no paraphrase are concatenated (joined by ``<br>``) into
        one pending entry that is scored with ``detect_text_by_ai_model``; pieces
        with a paraphrase are labelled "HUMAN" or "MACHINE" via ``check_human``.

        Takes no arguments and returns nothing. Results are appended to
        ``self.aligned_sentences``; image URLs returned by the search are added
        to ``self.found_img_url``.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        # Classify by search engine
        input_sentences = split_into_paragraphs(self.news_text)
        # presumably the index up to which the search has already consumed
        # paragraphs; it is returned by detect_text_by_relative_search - verify there
        current_index = 0
        previous_paraphrase = None
        # Accumulator for consecutive pieces that had no paraphrase match.
        ai_sentence = {
            "input_sentence": "",
            "matched_sentence": "",
            "label": "",
            "similarity": None,
            "paraphrase": False,
            "url": "",
            }

        for index, sentence in enumerate(input_sentences):
            print(f"-------index = {index}-------")
            print(f"current_sentence = {input_sentences[index]}")

            if current_index >= len(input_sentences):
                break
            # Skip pieces at or before current_index, but always process the
            # first and the last piece.
            if current_index >= index and index != 0 and index != len(input_sentences) - 1:
                continue
            
            paraphrase, text_url, searched_sentences, img_urls, current_index = detect_text_by_relative_search(input_sentences, index)

            if paraphrase is False:
                # add sentence to ai_sentence
                if ai_sentence["input_sentence"] != "":
                    ai_sentence["input_sentence"] += "<br>"
                ai_sentence["input_sentence"] += sentence
                if index == len(input_sentences) - 1:
                    # add ai_sentences to align_sentences
                    # (last piece: nothing later will flush the accumulator)
                    text_prediction_label, text_prediction_score = detect_text_by_ai_model(ai_sentence["input_sentence"])
                    ai_sentence["label"] = text_prediction_label
                    ai_sentence["similarity"] = text_prediction_score
                    self.aligned_sentences.append(ai_sentence)
            else:
                if previous_paraphrase is False or previous_paraphrase is None:
                    # add ai_sentences to align_sentences
                    # NOTE(review): the second condition can send an empty
                    # string to the AI model when nothing was accumulated - confirm intent.
                    if ai_sentence["input_sentence"] != "" or current_index >= len(input_sentences):
                        text_prediction_label, text_prediction_score = detect_text_by_ai_model(ai_sentence["input_sentence"])
                        ai_sentence["label"] = text_prediction_label
                        ai_sentence["similarity"] = text_prediction_score
                        self.aligned_sentences.append(ai_sentence)
                    
                        # reset
                        ai_sentence = {
                            "input_sentence": "",
                            "matched_sentence": "",
                            "label": "",
                            "similarity": None,
                            "paraphrase": False,
                            "url": "",
                            }

                # add searched_sentences to align_sentences
                if searched_sentences["input_sentence"] != "":
                    self.found_img_url.extend(img_urls)
                    if check_human(searched_sentences):
                        searched_sentences["label"] = "HUMAN"
                    else:
                        searched_sentences["label"] = "MACHINE"
                        
                    self.aligned_sentences.append(searched_sentences)

            # Remembered so the next matched piece knows whether to flush the accumulator.
            previous_paraphrase = paraphrase

    def detect_image_origin(self):
        print("CHECK IMAGE:")
        if self.news_image is None:
            self.image_prediction_label = "UNKNOWN"
            self.image_prediction_score = 0.0
            self.image_referent_url = None
            return
        
        for image in self.found_img_url:
            print(f"\tfound_img_url: {image}")    
        matched_url, similarity = detect_image_from_news_image(self.news_image, self.found_img_url)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return
        
        matched_url, similarity = detect_image_by_reverse_search(self.news_image)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return
        
        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            print(f"detected_label: {detected_label} ({score})")
            self.image_prediction_label = detected_label
            self.image_prediction_score = score
            self.image_referent_url = None
            return
        
        self.image_prediction_label = "UNKNOWN"
        self.image_prediction_score = 50
        self.image_referent_url = None

    def determine_news_origin(self):
        if self.text_prediction_label == "MACHINE":
            text_prediction_score = 100 - self.text_prediction_score
        elif self.text_prediction_label == "UNKNOWN":
            text_prediction_score = 50
        else:
            text_prediction_score = self.text_prediction_score
            
        if self.image_prediction_label == "MACHINE":
            image_prediction_score = 100 - self.image_prediction_score
        elif self.image_prediction_label == "UNKNOWN":
            image_prediction_score = 50
        else:
            image_prediction_score = self.image_prediction_score
        
        news_prediction_score = (text_prediction_score + image_prediction_score) / 2
        if news_prediction_score > 50:
            self.news_prediction_score = news_prediction_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - news_prediction_score
            self.news_prediction_label = "MACHINE"

    def generate_analysis_report(self):
        """Run the text analysis and then the image analysis on the loaded news."""
        # Text goes first: it runs before the image step, which reads
        # self.found_img_url (extended during the text step).
        for analysis_step in (self.determine_text_origin, self.detect_image_origin):
            analysis_step()

    def analyze_details(self):
        """Build the three HTML report tables.

        Returns:
            tuple: ``(ordinary_user_table, fact_checker_table, governor_table)``,
            each an HTML string, built in that order.
        """
        return (
            self.create_ordinary_user_table(),
            self.create_fact_checker_table(),
            self.create_governor_table(),
        )
    
    def get_text_urls(self):
        return set(self.text_referent_url)


    def compare_sentences(self, sentence_1, sentence_2, position, color):
        """
        Compares two sentences and identifies common phrases, outputting their start and end positions.

        Args:
            sentence_1: The first sentence (string).
            sentence_2: The second sentence (string).

        Returns:
            A list of dictionaries, where each dictionary represents a common phrase and contains:
                - "phrase": The common phrase (string).
                - "start_1": The starting index of the phrase in sentence_1 (int).
                - "end_1": The ending index of the phrase in sentence_1 (int).
                - "start_2": The starting index of the phrase in sentence_2 (int).
                - "end_2": The ending index of the phrase in sentence_2 (int).
            Returns an empty list if no common phrases are found.  Handles edge cases like empty strings.
        """

        if not sentence_1 or not sentence_2:  # Handle empty strings
            return []

        s = SequenceMatcher(None, sentence_1, sentence_2)
        common_phrases = []

        for block in s.get_matching_blocks():
            if block.size > 0:  # Ignore zero-length matches
                start_1 = block.a
                end_1 = block.a + block.size
                start_2 = block.b
                end_2 = block.b + block.size

                phrase = sentence_1[start_1:end_1]  # Or sentence_2[start_2:end_2], they are the same

                common_phrases.append({
                    "phrase": phrase,
                    "start_1": start_1 + position,
                    "end_1": end_1 + position,
                    "start_2": start_2,
                    "end_2": end_2,
                    "color": color,
                })
        position += len(sentence_1)
        return common_phrases, position

    def create_fact_checker_table(self):
        rows = []
        max_length = 30  # TODO: put this in configuration
        rows.append(self.format_image_fact_checker_row(max_length))
        
        for aligned_sentence in self.aligned_sentences:
            if "input_sentence" not in aligned_sentence:
                continue
            
            # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                    aligned_sentence["input_sentence"],
                    aligned_sentence["matched_sentence"],
                )
            
            # Get entity-words (in pair) with colors
            entities_with_colors = highlight_entities(
                    aligned_sentence["input_sentence"],
                    aligned_sentence["matched_sentence"],
                )
            
            self.fact_checker_table.append(
                [
                    aligned_sentence,
                    equal_idx_1,
                    equal_idx_2,
                    entities_with_colors,
                ]
            )

        for row in self.fact_checker_table:
            formatted_row = self.format_text_fact_checker_row(row, max_length)
            rows.append(formatted_row)
        
        table = "\n".join(rows)
        return f"""
        <h5>Comparison between input news and source news</h5>
        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
            <thead>
                <tr>
                    <th>Input news</th>
                    <th>Source (URL provided in Originality column correspondingly)</th>
                    <th>Forensic</th>
                    <th>Originality</th>
                </tr>
            </thead>
            <tbody>
                {table}
            </tbody>
        </table>
        
        <style>
    """

    def format_text_fact_checker_row(self, row, max_length=30):
        if row[0]["input_sentence"] == "":
            return ""
        if row[0]["matched_sentence"] != "":  # source is not empty
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input")
            source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source")
            
            # Color overlapping words
            input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input)  # text, index of highlight words
            source_sentence = self.color_text(source_sentence, row[2], highlight_idx_source)  # text, index of highlight words
            
            input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
            source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
        else:
            input_sentence = row[0]["input_sentence"]
            source_sentence = row[0]["matched_sentence"]

        label = row[0]["label"]
        score = row[0]["similarity"]
        
        url = row[0]["url"] #
        short_url = self.shorten_url(url, max_length)
        source_text_url = f"""<a href="{url}">{short_url}</a>"""
        
        return f"""
                <tr>
                    <td>{input_sentence}</td>
                    <td>{source_sentence}</td>
                    <td>{label}<br>({score*100:.2f}%)</td>
                    <td>{source_text_url}</td>
                </tr>
                """

    def format_image_fact_checker_row(self, max_length=30):        
        
        if self.image_referent_url is not None or self.image_referent_url != "":
            source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
            short_url = self.shorten_url(self.image_referent_url, max_length)
            source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
        else:
            source_image = "Image not found"
            source_image_url = ""

        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""


    def create_ordinary_user_table(self):
        rows = []
        max_length = 30  # TODO: put this in configuration
        rows.append(self.format_image_ordinary_user_row(max_length))
        rows.append(self.format_text_ordinary_user_row(max_length))
        table = "\n".join(rows)
        
        return f"""
        <h5>Comparison between input news and source news</h5>
        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
            <thead>
                <tr>
                    <th>Input news</th>
                    <th>Forensic</th>
                    <th>Originality</th>
                </tr>
            </thead>
            <tbody>
                {table}
            </tbody>
        </table>
        
        <style>
    """

    def format_text_ordinary_user_row(self, max_length=30):
        input_sentences = ""
        source_text_urls = ""
        label = ""
        scores = 0
        sentence_count = 0
        for index, row in enumerate(self.aligned_sentences):
            if row["input_sentence"] == "":
                continue
            input_sentences += row["input_sentence"]
            label = self.aligned_sentences[index]["label"]
            if label == "HUMAN":
                score = self.aligned_sentences[index]["similarity"]
            if label == "MACHINE":
                score = 1 - self.aligned_sentences[index]["similarity"]
            scores += score
        
            url = self.aligned_sentences[index]["url"] #
            short_url = self.shorten_url(url, max_length)
            source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
            sentence_count += 1
        
        if scores == 0:
            label = "UNKNOWN"
        else:
            scores /= sentence_count
            if scores > 0.5:
                label = "HUMAN"
            else:
                label = "MACHINE"
                scores = 1 - scores
        
        return f"""
                <tr>
                    <td>{input_sentences}</td>
                    <td>{label}<br>({scores*100:.2f}%)</td>
                    <td>{source_text_urls}</td>
                </tr>
                """

    def format_image_ordinary_user_row(self, max_length=30):        
        
        if self.image_referent_url is not None or self.image_referent_url != "":
            source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
            short_url = self.shorten_url(self.image_referent_url, max_length)
            source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
        else:
            source_image = "Image not found"
            source_image_url = ""

        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""


    def create_governor_table(self):
        rows = []
        max_length = 30  # TODO: put this in configuration
        rows.append(self.format_image_governor_row(max_length))
        
        for aligned_sentence in self.aligned_sentences:
            if "input_sentence" not in aligned_sentence:
                continue
            
            # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                    aligned_sentence["input_sentence"],
                    aligned_sentence["matched_sentence"],
                )
            
            # Get entity-words (in pair) with colors
            entities_with_colors = highlight_entities(
                    aligned_sentence["input_sentence"],
                    aligned_sentence["matched_sentence"],
                )
            
            self.governor_table.append(
                [
                    aligned_sentence,
                    equal_idx_1,
                    equal_idx_2,
                    entities_with_colors,
                ]
            )

        formatted_row = self.format_text_governor_row(max_length)
        rows.append(formatted_row)
        
        table = "\n".join(rows)
        return f"""
            <h5>Comparison between input news and source news</h5>
            <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
            <col style="width: 150px;">  <col style="width: 150px;">  <col style="width: 50px;"> <col style="width: 75px;">
                <thead>
                    <tr>
                        <th>Input news</th>
                        <th>Source (URL provided in Originality column correspondingly)</th>
                        <th>Forensic</th>
                        <th>Originality</th>
                    </tr>
                </thead>
                <tbody>
                    {table}
                </tbody>
            </table>
            
            <style>
        """

    def format_text_governor_row(self,  max_length=30):
        input_sentences = ""
        source_sentences = ""
        source_text_urls = ""
        label = ""
        scores = 0
        sentence_count = 0
        entity_count = 0
        for row in self.governor_table:
            print(f"governor_row: {row}")
            if row[0]["input_sentence"] == "":
                continue
            
            if row[0]["matched_sentence"] != "":  # source is not empty
                # highlight entities
                input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input", entity_count)
                source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source", entity_count)
                entity_count += len(row[3])
                
                # Color overlapping words
                input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input)  # text, index of highlight words
                source_sentence = self.color_text(source_sentence, row[2], highlight_idx_source)  # text, index of highlight words
                
                input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
                source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")

            else:
                input_sentence = row[0]["input_sentence"]
                source_sentence = row[0]["matched_sentence"]
            
            input_sentences += input_sentence
            source_sentences += source_sentence
            score = row[0]["similarity"]
            label = row[0]["label"]
            if label == "HUMAN":
                score = row[0]["similarity"]
            if label == "MACHINE":
                score = 1 - row[0]["similarity"]
            scores += score
            
            url = row[0]["url"]
            short_url = self.shorten_url(url, max_length)
            source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
            sentence_count += 1
        
        if scores == 0:
            label = "UNKNOWN"
        else:
            scores /= sentence_count
            if scores > 0.5:
                label = "HUMAN"
            else:
                label = "MACHINE"
                scores = 1 - scores
        
        return f"""
                <tr>
                    <td>{input_sentences}</td>
                    <td>{source_sentences}</td>
                    <td>{label}<br>({score*100:.2f}%)</td>
                    <td>{source_text_urls}</td>
                </tr>
                """

    def format_image_governor_row(self, max_length=30):
        if self.image_referent_url is not None or self.image_referent_url != "":
            source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
            short_url = self.shorten_url(self.image_referent_url, max_length)
            source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
        else:
            source_image = "Image not found"
            source_image_url = ""

        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""


    def shorten_url(self, url, max_length=30):
        if url is None:
            return ""
        
        if len(url) > max_length:
            short_url = url[:max_length] + "..."
        else:
            short_url = url
        return short_url


    def color_text(self, text, colored_idx, highlighted_idx):
        paragraph = ""
        words = text.split()
        
        starts, ends = self.extract_starts_ends(colored_idx)
        starts, ends = self.filter_indices(starts, ends, highlighted_idx)

        previous_end = 0
        for start, end in zip(starts, ends):
            paragraph += " ".join(words[previous_end:start])
            
            equal_words = " ".join(words[start:end])
            paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
            
            previous_end = end
        
        # Some left words due to the punctuation separated from 
        # the highlighting text
        equal_words = " ".join(words[previous_end:])
        print(f"starts_2: {previous_end}")
        print(f"ends_2: {len(words)-1}")
        print(f"equal_words: {words[previous_end:]}")
        paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "

        return paragraph
    
    def extract_starts_ends(self, colored_idx):
        starts = []
        ends = []
        for index in colored_idx:
            starts.append(index['start'])
            ends.append(index['end'])
        return starts, ends
        
    
    def filter_indices(self, starts, ends, ignore_indices):
        """
        Filters start and end indices to exclude any indices present in the ignore_indices list.

        Args:
            starts: A list of starting indices.
            ends: A list of ending indices. Must be the same length as starts.
            ignore_indices: A list of indices to exclude.

        Returns:
            A tuple containing two new lists: filtered_starts and filtered_ends.
            Returns empty lists if the input is invalid or if all ranges are filtered out.
            Prints error messages for invalid input.
            
        Examples:
            starts = [0, 5, 10]
            ends = [3, 7, 12]
            ignore_indices = [1, 2, 11, 17]
            
            # Output: 
                starts = [0, 3, 5, 10, 12]
                ends = [0, 3, 7, 10, 12]

        """

        if len(starts) != len(ends):
            print("Error: The 'starts' and 'ends' lists must have the same length.")
            return [], []

        filtered_starts = []
        filtered_ends = []

        for i in range(len(starts)):
            start = starts[i]
            end = ends[i]

            if end < start:
                print(f"Error: End index {end} is less than start index {start} at position {i}.")
                return [], []

            
            start_end = list(range(start, end + 1, 1))
            start_end = list(set(start_end) - set(ignore_indices))
            new_start, new_end = self.extract_sequences(start_end)
            filtered_starts.extend(new_start)
            filtered_ends.extend(new_end)

        return filtered_starts, filtered_ends

    def extract_sequences(self, numbers):
        if len(numbers) == 1:
            return [numbers[0]], [numbers[0]]
        
        numbers.sort()
        starts = []
        ends = []
        for i, number in enumerate(numbers):
            if i == 0:
                start = number
                end = number
                continue
            
            if number - 1 == numbers[i-1]:
                end = number
            else:
                starts.append(start)
                ends.append(end + 1)
                start = number
                end = number
        
            if i == len(numbers) - 1:
                starts.append(start)
                ends.append(end + 1)
        
        return starts, ends