Spaces:

pmkhanh7890
/

news_verification

Sleeping

File size: 15,085 Bytes

da7dbd0
d952fbe
 
da7dbd0
 
d952fbe
da7dbd0
1ce1659
 
badcb49
da7dbd0
 
 
 
 
 
d952fbe
 
 
 
 
 
da7dbd0
 
 
d952fbe
 
 
 
da7dbd0
 
 
 
 
 
1ce1659
da7dbd0
 
 
1ce1659
da7dbd0
 
 
 
 
 
 
 
 
 
 
d952fbe
 
 
da7dbd0
d952fbe
 
 
 
 
 
 
 
 
 
 
 
 
da7dbd0
d952fbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ce1659
da7dbd0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ce1659
da7dbd0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d952fbe
 
 
da7dbd0
 
d952fbe
da7dbd0
d952fbe
 
 
da7dbd0
 
 
 
 
d952fbe
 
 
 
da7dbd0
d952fbe
da7dbd0
 
 
d952fbe
 
da7dbd0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d952fbe
da7dbd0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d952fbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da7dbd0
d952fbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da7dbd0
d952fbe
da7dbd0
 
 
d952fbe
 
 
 
da7dbd0
 
 
d952fbe
da7dbd0
 
d952fbe
 
1ce1659
da7dbd0
d952fbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ce1659
da7dbd0

import difflib
import html
from difflib import SequenceMatcher

from src.application.highlight_text import generate_color
from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.preprocessing import split_into_sentences
from src.application.text.search_detection import check_human, detect_text_by_relative_search


class NewsVerification():
    def __init__(self):
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""
        
        self.text_prediction_label:list[str] = []
        self.text_prediction_score:list[float] = []
        self.text_referent_url:list[str] = []
        self.image_prediction_label:list[str] = []
        self.image_prediction_score:list[str] = []
        self.image_referent_url:list[str] = []
        self.news_prediction_label = ""
        self.news_prediction_score = -1
        
        self.found_img_url:list[str] = []
        self.aligned_sentences:list[dict] = []
        self.is_paraphrased:list[bool] = []
        self.analyzed_table:list[list] = []
        
    def load_news(self, news_title, news_content, news_image):
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def determine_text_origin(self):
        """Classify each sentence of ``self.news_text`` as human or machine text.

        Every sentence is first looked up with
        ``detect_text_by_relative_search``. When that reports a paraphrase,
        the label is "HUMAN" or "MACHINE" according to ``check_human`` and the
        score is the alignment's ``"similarity"``; the image URLs returned by
        the search are added to ``self.found_img_url``. Otherwise the label
        and score come from ``detect_text_by_ai_model``.

        One entry per sentence is appended to ``text_prediction_label``,
        ``text_prediction_score``, ``text_referent_url``,
        ``aligned_sentences`` and ``is_paraphrased``. Nothing is returned.

        NOTE(review): the result lists are only appended to here, so calling
        this twice on one instance accumulates entries from both runs.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        # Classify by search engine
        for sentence in split_into_sentences(self.news_text):
            paraphrase, text_url, aligned_sentence, img_urls = (
                detect_text_by_relative_search(sentence)
            )

            if paraphrase is False:
                # Classify text by AI model
                print("\tFrom AI model:")
                text_prediction_label, text_prediction_score = (
                    detect_text_by_ai_model(sentence)
                )
                # The search gave no alignment: store a placeholder so the
                # sentence still has an entry with the expected keys. Any
                # empty value (not just an empty list) counts as "none".
                if not aligned_sentence:
                    aligned_sentence = {
                        "input_sentence": sentence,
                        "matched_sentence": "",
                        "similarity": text_prediction_score,
                        "is_paraphrase_sentence": False,
                        "url": "",
                    }
            else:
                self.found_img_url.extend(img_urls)
                text_prediction_score = aligned_sentence["similarity"]
                if check_human(aligned_sentence):
                    text_prediction_label = "HUMAN"
                else:
                    text_prediction_label = "MACHINE"

            print(f"\ttext_prediction_label: {text_prediction_label}\n")
            self.text_prediction_label.append(text_prediction_label)
            self.aligned_sentences.append(aligned_sentence)
            self.is_paraphrased.append(paraphrase)
            self.text_referent_url.append(text_url)
            self.text_prediction_score.append(text_prediction_score)

    def detect_image_origin(self):
        print("CHECK IMAGE:")
        if self.news_image is None:
            self.image_prediction_label = "UNKNOWN"
            self.image_prediction_score = 0.0
            self.image_referent_url = None
            return
        
        print(f"\t: Img path: {self.news_image}")    
        matched_url, similarity = detect_image_from_news_image(self.news_image, self.found_img_url)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return
        
        matched_url, similarity = detect_image_by_reverse_search(self.news_image)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return
        
        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            self.image_prediction_label = detected_label
            self.image_prediction_score = score
            self.image_referent_url = None
            return
        
        self.image_prediction_label = "UNKNOWN"
        self.image_prediction_score = 50
        self.image_referent_url = None

    def determine_news_origin(self):
        if self.text_prediction_label == "MACHINE":
            text_prediction_score = 100 - self.text_prediction_score
        elif self.text_prediction_label == "UNKNOWN":
            text_prediction_score = 50
        else:
            text_prediction_score = self.text_prediction_score
            
        if self.image_prediction_label == "MACHINE":
            image_prediction_score = 100 - self.image_prediction_score
        elif self.image_prediction_label == "UNKNOWN":
            image_prediction_score = 50
        else:
            image_prediction_score = self.image_prediction_score
        
        news_prediction_score = (text_prediction_score + image_prediction_score) / 2
        if news_prediction_score > 50:
            self.news_prediction_score = news_prediction_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - news_prediction_score
            self.news_prediction_label = "MACHINE"

    def generate_analysis_report(self):
        self.determine_text_origin()
        self.detect_image_origin()

    def analyze_details(self):
        self.analyzed_table = []
        # IMAGES:
        
        
        # TEXT
        for pair in self.aligned_sentences:
            print(f"pair: {pair}")
            if "input_sentence" not in pair:
                continue
            input_words, source_words, input_indexes, source_indexes = (
                self.highlight_overlap_by_word_to_list(
                    pair["input_sentence"],
                    pair["matched_sentence"],
                )
                # self.compare_sentences(
                #     pair["input_sentence"],
                #     pair["matched_sentence"],
                # )
            )
            self.analyzed_table.append(
                (input_words, source_words, input_indexes, source_indexes),
            )
        
        if len(self.analyzed_table) != 0:
            html_table = self.create_table()
        else:
            html_table = ""
        return html_table
        
    def highlight_overlap_by_word_to_list(self, text1, text2):
        """
        Return
        - list of words in text1
        - list of words in text2
        - list of index of highlight words in text 1
        - list of index of highlight words in text 2
        """
        # Tách chuỗi thành các từ (word) dựa vào khoảng trắng
        words1 = text1.split()
        words2 = text2.split()

        index1 = []
        index2 = []

        # Sử dụng SequenceMatcher để tìm các đoạn trùng lặp giữa danh sách các từ
        matcher = SequenceMatcher(None, words1, words2)
        
        highlighted_text1 = []
        highlighted_text2 = []

        # Theo dõi vị trí hiện tại trong words1 và words2
        current_pos1 = 0
        current_pos2 = 0

        # Lặp qua các đoạn so khớp
        for match in matcher.get_matching_blocks():
            start1, start2, length = match
            print(start1, start2, length)

            # Thêm các từ không trùng lặp vào (giữ nguyên)
            highlighted_text1.extend(words1[current_pos1:start1])
            highlighted_text2.extend(words2[current_pos2:start2])

            if length > 0:
                for i in range(start1, start1 + length):
                    index1.append(i)
                for i in range(start2, start2 + length):
                    index2.append(i)
                
            # Cập nhật vị trí hiện tại
            current_pos1 = start1 + length
            current_pos2 = start2 + length
        
        return words1, words2, index1, index2
    
    
    def get_text_urls(self):
        return set(self.text_referent_url)
    
    def generate_colors_list(self, set_urls):
        color_dict = {}
        num_urls = len(set_urls)
        for i in range(num_urls):
            color_dict[i] = generate_color(i, num_urls)
        
        return color_dict            

    def analyze_details_2(self):
        html_text = ""
        
        self.analyzed_table = []
        # TEXT
        # Assign unique colors to each index
        set_urls = self.get_text_urls()
        color_dict = self.generate_colors_list(set_urls)

        # position of the color in the input contents
        position = 0
        for pair in self.aligned_sentences:
            if "input_sentence" not in pair:
                continue
            common_phrases, position = self.compare_sentences(
                pair["input_sentence"],
                pair["matched_sentence"],
                position,
                color_dict["0"],  # TODO: set color
            )
            
        
        if len(self.analyzed_table) != 0:
            html_table = self.create_table()
        else:
            html_table = ""
        return html_text, html_table
        
    def compare_sentences(self, sentence_1, sentence_2, position, color):
        """
        Compares two sentences and identifies common phrases, outputting their start and end positions.

        Args:
            sentence_1: The first sentence (string).
            sentence_2: The second sentence (string).

        Returns:
            A list of dictionaries, where each dictionary represents a common phrase and contains:
                - "phrase": The common phrase (string).
                - "start_1": The starting index of the phrase in sentence_1 (int).
                - "end_1": The ending index of the phrase in sentence_1 (int).
                - "start_2": The starting index of the phrase in sentence_2 (int).
                - "end_2": The ending index of the phrase in sentence_2 (int).
            Returns an empty list if no common phrases are found.  Handles edge cases like empty strings.
        """

        if not sentence_1 or not sentence_2:  # Handle empty strings
            return []

        s = difflib.SequenceMatcher(None, sentence_1, sentence_2)
        common_phrases = []

        for block in s.get_matching_blocks():
            if block.size > 0:  # Ignore zero-length matches
                start_1 = block.a
                end_1 = block.a + block.size
                start_2 = block.b
                end_2 = block.b + block.size

                phrase = sentence_1[start_1:end_1]  # Or sentence_2[start_2:end_2], they are the same

                common_phrases.append({
                    "phrase": phrase,
                    "start_1": start_1 + position,
                    "end_1": end_1 + position,
                    "start_2": start_2,
                    "end_2": end_2,
                    "color": color,
                })
        position += len(sentence_1)
        return common_phrases, position

    def create_table(self):
        #table_rows = "\n".join([self.format_row(row) for row in self.analyzed_table])
        # loop of self.analyzed_table with index:
        rows = []
        max_length = 30  # TODO: put this in configuration
        rows.append(self.format_image_row(max_length))
        
        for index, row in enumerate(self.analyzed_table):
            formatted_row = self.format_text_row(row, index, max_length)
            rows.append(formatted_row)
        table = "\n".join(rows)
        return f"""
        <h5>Comparison between input news and source news</h5>
        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
            <thead>
                <tr>
                    <th>Input news</th>
                    <th>Source content</th>
                    <th>Forensic</th>
                    <th>Originality</th>
                </tr>
            </thead>
            <tbody>
                {table}
            </tbody>
        </table>
        
        <style>
    """
    
    def format_text_row(self, row, index = 0, max_length=30):
        input_sentence = self.highlight_text(row[0], row[2])  # text, index of highlight words
        source_sentence = self.highlight_text(row[1], row[3])  # text, index of highlight words
        
        url = self.aligned_sentences[index]["url"] #
        short_url = self.shorten_url(url, max_length)
        source_text_url = f"""<a href="{url}">{short_url}</a>"""
        
        # short_url = self.shorten_url(self.text_referent_url[index], max_length)
        # source_text_url = f"""<a href="{self.text_referent_url[index]}">{short_url}</a>"""
        
        self.text_prediction_score[index]
        return f"""<tr><td>{input_sentence}</td><td>{source_sentence}</td><td>{self.text_prediction_label[index]}<br>({self.text_prediction_score[index]*100:.2f}%)</td><td>{source_text_url}</td></tr>"""
        
    def format_image_row(self, max_length=30):        
        # input_image = f"""<img src="{self.news_image}" width="200" height="150">"""
        print(f"self.news_image = {self.news_image}")
        source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
        short_url = self.shorten_url(self.image_referent_url, max_length)
        source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
    
    def shorten_url(self, url, max_length=30):
        if url is None:
            return ""
        
        if len(url) > max_length:
            short_url = url[:max_length] + "..."
        else:
            short_url = url
        return short_url

    def highlight_text(self, words, indexes):
        final_words = words
        for index in indexes:
            final_words[index] = (
                f"<span style='color:#00FF00; font-weight:bold;'>{words[index]}</span>"
            )
        return " ".join(final_words)