Spaces:

pmkhanh7890
/

news_verification

Sleeping

File size: 11,751 Bytes

import html
from difflib import SequenceMatcher

from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.search_detection import check_human, detect_text_by_relative_search


class NewsVerification:
    """Hold one news item, its origin predictions, and render them as HTML.

    Typical flow: ``load_news`` -> ``generate_analysis_report`` (runs the text,
    image and combined classification, then returns the summary HTML) ->
    ``analyze_details`` (returns the sentence-by-sentence comparison table).

    Prediction labels used by this class are the strings ``"HUMAN"``,
    ``"MACHINE"`` and ``"UNKNOWN"``; scores are handled as percentages.
    """

    # Referent URLs longer than this are truncated (with "...") in link labels.
    _URL_LABEL_MAX_LENGTH = 40

    def __init__(self):
        """Initialise every field to its "nothing analysed yet" value."""
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""

        # -1 marks a score that has not been computed yet.
        self.text_prediction_label = ""
        self.text_prediction_score = -1
        self.text_referent_url = None
        self.image_prediction_label = ""
        self.image_prediction_score = -1
        self.image_referent_url = None
        self.news_prediction_label = ""
        self.news_prediction_score = -1

        self.found_img_url = []
        self.aligned_sentences = []
        self.is_paraphrased = False

    def load_news(self, news_title, news_content, news_image):
        """Store the news item; the analysed text is title + blank line + content."""
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def determine_text_origin(self):
        """Classify ``self.news_text`` and store the result on the instance.

        The search-based detector runs first and fills ``is_paraphrased``,
        ``text_referent_url``, ``aligned_sentences`` and ``found_img_url``.
        If it reports a match, the score is set to 100 and ``check_human``
        decides between "HUMAN" and "MACHINE". Otherwise the AI model
        supplies both label and score.

        Returns:
            None. Updates ``text_prediction_label`` and
            ``text_prediction_score``.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        # Classify by search engine
        (
            self.is_paraphrased,
            self.text_referent_url,
            self.aligned_sentences,
            self.found_img_url,
        ) = detect_text_by_relative_search(self.news_text)

        if self.is_paraphrased is False:
            self.text_prediction_label = "UNKNOWN"
        else:
            self.text_prediction_score = 100
            if check_human(self.aligned_sentences):
                self.text_prediction_label = "HUMAN"
            else:
                self.text_prediction_label = "MACHINE"

        # Classify text by AI model
        print("\tFrom AI model:")
        if self.text_prediction_label == "UNKNOWN":
            self.text_prediction_label, self.text_prediction_score = detect_text_by_ai_model(
                self.news_text
            )
            # NOTE(review): presumably the model returns a 0-1 probability,
            # rescaled here to a percentage -- confirm against the model helper.
            self.text_prediction_score *= 100

    def _set_image_result(self, label, score, referent_url):
        """Record the image verdict in one place."""
        self.image_prediction_label = label
        self.image_prediction_score = score
        self.image_referent_url = referent_url

    def detect_image_origin(self):
        """Classify ``self.news_image`` and store the result on the instance.

        Order of attempts: (1) match against the images found with the
        referent news (``found_img_url``), (2) reverse image search, (3) the
        AI model. A match in (1) or (2) is labelled "HUMAN" with the returned
        similarity as score. With no image the result is "UNKNOWN" / 0.0; if
        the AI model yields no label the result is "UNKNOWN" / 50.

        Returns:
            None. Updates ``image_prediction_label``,
            ``image_prediction_score`` and ``image_referent_url``.
        """
        print("CHECK IMAGE:")
        if self.news_image is None:
            self._set_image_result("UNKNOWN", 0.0, None)
            return

        print(f"\t: Img path: {self.news_image}")
        matched_url, similarity = detect_image_from_news_image(
            self.news_image, self.found_img_url
        )
        if matched_url is None:
            # Nothing among the referent news images: fall back to reverse search.
            matched_url, similarity = detect_image_by_reverse_search(self.news_image)

        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self._set_image_result("HUMAN", similarity, matched_url)
            return

        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            self._set_image_result(detected_label, score, None)
            return

        self._set_image_result("UNKNOWN", 50, None)

    @staticmethod
    def _human_likelihood(label, score):
        """Map a (label, confidence) pair onto a single "how human" percentage.

        "MACHINE" confidence is mirrored (100 - score), "UNKNOWN" counts as a
        neutral 50, and any other label keeps its score unchanged.
        """
        if label == "MACHINE":
            return 100 - score
        if label == "UNKNOWN":
            return 50
        return score

    def determine_news_origin(self):
        """Combine the text and image verdicts into the overall news verdict.

        The two "how human" percentages are averaged. An average strictly
        above 50 yields "HUMAN" with that average as score; otherwise the
        verdict is "MACHINE" with ``100 - average`` as score.

        Returns:
            None. Updates ``news_prediction_label`` and
            ``news_prediction_score``.
        """
        text_score = self._human_likelihood(
            self.text_prediction_label, self.text_prediction_score
        )
        image_score = self._human_likelihood(
            self.image_prediction_label, self.image_prediction_score
        )

        combined = (text_score + image_score) / 2
        if combined > 50:
            self.news_prediction_score = combined
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - combined
            self.news_prediction_label = "MACHINE"

    def generate_analysis_report(self):
        """Run the full analysis pipeline and return the summary as HTML.

        Returns:
            str: HTML fragment with the originality, forensic, misinformation
            and description sections.
        """
        self.determine_text_origin()
        self.detect_image_origin()
        self.determine_news_origin()
        return self._render_report()

    @classmethod
    def _reference_item(cls, url, label):
        """Return the ``<li>`` element for one referent URL.

        ``None`` yields the "No referent information" item. Otherwise the link
        text is ``label`` followed by the URL, truncated to
        ``_URL_LABEL_MAX_LENGTH`` characters with "..." appended only when
        something was actually cut off. The URL is HTML-escaped in both the
        attribute and the visible text.
        """
        if url is None:
            return "<li>No referent information</li>"

        url = str(url)
        shown = url[: cls._URL_LABEL_MAX_LENGTH]
        if len(url) > cls._URL_LABEL_MAX_LENGTH:
            shown += "..."

        href = html.escape(url, quote=True)
        text = html.escape(shown, quote=False)
        return f'<li><a href="{href}" target="_blank">{label}: {text}</a></li>'

    def _render_report(self):
        """Render the already-computed predictions as the summary HTML."""
        # Forensic analysis
        if self.text_prediction_label == "MACHINE":
            text_prediction_label = "The text is modified by GPT-4o (AI)"
        else:
            text_prediction_label = "The text is written by HUMAN"

        if self.image_prediction_label == "MACHINE":
            image_prediction_label = "The image is generated by Dall-e (AI)"
        else:
            image_prediction_label = "The image is generated by HUMAN"

        if self.news_prediction_label == "MACHINE":
            news_prediction_label = "The whole news generated by AI"
        else:
            news_prediction_label = "The whole news written by HUMAN"

        # Misinformation analysis (placeholder: verdict and score are hard-coded).
        out_of_context_verdict = "cohesive"
        if out_of_context_verdict == "cohesive":
            out_of_context_results = "The input news is cohesive (non-out-of-context)"
        else:
            out_of_context_results = "The input news is out-of-context"
        out_of_context_prediction_score = 96.7

        # Description (placeholder text).
        description = "The description should be concise, clear, and aimed at helping general readers understand the case."

        referred_news = self._reference_item(self.text_referent_url, "Referred news")
        referred_image = self._reference_item(self.image_referent_url, "Referred image")

        html_template = f"""
        <div>
            <h3>Originality:</h3>
            <ul>
                {referred_news}
                {referred_image}
            </ul>
        </div>

        <div>
            <h3>Forensic:</h3>
            <b>{news_prediction_label} (confidence = {self.news_prediction_score:.2f}%)</b>
            <ul>
                <li>{text_prediction_label} (confidence = {self.text_prediction_score:.2f}%)</li>
                <li>{image_prediction_label} (confidence = {self.image_prediction_score:.2f}%)</li>
            </ul>
        </div>

        <div>
            <h3>Misinformation (placeholder):</h3>
            <ul>
                <li>{out_of_context_results} (confidence = {out_of_context_prediction_score:.2f}%)</li>
            </ul>
        </div>

        <div>
            <h3>Description (optional, placeholder):</h3>
            <ul>
                <li>{description}</li>
            </ul>
        </div>
        """

        return html_template

    def analyze_details(self):
        """Return the HTML comparison table for ``self.aligned_sentences``.

        Each aligned pair is read through its ``"input_sentence"`` and
        ``"matched_sentence"`` keys.

        Returns:
            str: The table HTML, or ``""`` when there are no aligned sentences.
        """
        final_table = [
            self.highlight_overlap_by_word_to_list(
                pair["input_sentence"],
                pair["matched_sentence"],
            )
            for pair in self.aligned_sentences
        ]

        if not final_table:
            return ""
        return self.create_table(final_table)

    def highlight_overlap_by_word_to_list(self, text1, text2):
        """Find the words shared by two texts.

        Both texts are split on whitespace and compared word-by-word with
        ``SequenceMatcher``.

        Returns:
            tuple: ``(words1, words2, index1, index2)`` where ``words1`` /
            ``words2`` are the word lists of ``text1`` / ``text2`` and
            ``index1`` / ``index2`` are the positions, in those lists, of the
            words that belong to a matching block.
        """
        # Split each string into words on whitespace.
        words1 = text1.split()
        words2 = text2.split()

        index1 = []
        index2 = []

        # Use SequenceMatcher to find the runs of words common to both lists.
        matcher = SequenceMatcher(None, words1, words2)
        for start1, start2, length in matcher.get_matching_blocks():
            # The terminating dummy block has length 0 and contributes nothing.
            index1.extend(range(start1, start1 + length))
            index2.extend(range(start2, start2 + length))

        return words1, words2, index1, index2

    def create_table(self, data):
        """Build the comparison table HTML.

        Args:
            data: Iterable of ``(input_words, source_words, input_indexes,
                source_indexes)`` tuples, one per table row.

        Returns:
            str: Heading linking to ``self.text_referent_url`` followed by the
            two-column table.
        """
        table_rows = "\n".join([self.format_pair(pair) for pair in data])
        # Quote and escape the URL so characters such as spaces, '&' or '>'
        # cannot break out of the attribute.
        source_url = html.escape(str(self.text_referent_url or ""), quote=True)
        return f"""
        <h5>Comparison between input news and <a href="{source_url}" target="_blank">source news</a></h5>
        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
            <thead>
                <tr>
                    <th>Input sentence</th>
                    <th>Source sentence</th>
                </tr>
            </thead>
            <tbody>
                {table_rows}
            </tbody>
        </table>
    """

    def format_pair(self, pair):
        """Render one ``(input_words, source_words, input_idx, source_idx)`` row."""
        input_sentence = self.highlight_text(pair[0], pair[2])
        source_sentence = self.highlight_text(pair[1], pair[3])
        return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"

    def highlight_text(self, words, indexes):
        """Join ``words`` into one HTML string, highlighting selected positions.

        Every word is HTML-escaped; words whose position is in ``indexes`` are
        wrapped in a green bold ``<span>``. ``words`` itself is not modified,
        so the same data can be rendered more than once.

        Args:
            words: List of plain-text words.
            indexes: Positions in ``words`` to highlight.

        Returns:
            str: The words joined by single spaces.
        """
        marked = set(indexes)
        rendered = []
        for position, word in enumerate(words):
            safe_word = html.escape(word, quote=False)
            if position in marked:
                safe_word = (
                    f"<span style='color:#00FF00; font-weight:bold;'>{safe_word}</span>"
                )
            rendered.append(safe_word)
        return " ".join(rendered)