from difflib import SequenceMatcher
import difflib

from src.application.highlight_text import generate_color
from src.application.image.image_detection import (
    detect_image_by_ai_model,
    detect_image_by_reverse_search,
    detect_image_from_news_image,
)
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.preprocessing import split_into_sentences
from src.application.text.search_detection import check_human, detect_text_by_relative_search


class NewsVerification:
    def __init__(self):
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""

        self.text_prediction_label: list[str] = []
        self.text_prediction_score: list[float] = []
        self.text_referent_url: list[str] = []

        self.image_prediction_label: list[str] = []
        self.image_prediction_score: list[float] = []
        self.image_referent_url: list[str] = []

        self.news_prediction_label = ""
        self.news_prediction_score = -1

        self.found_img_url: list[str] = []
        self.aligned_sentences: list[dict] = []
        self.is_paraphrased: list[bool] = []
        self.analyzed_table: list[list] = []

    def load_news(self, news_title, news_content, news_image):
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def determine_text_origin(self):
        """
        Determines the origin of the loaded news text based on paraphrasing
        detection and human-authorship analysis.

        For each sentence, appends one entry to the per-sentence result lists:
            - "HUMAN": the sentence is likely written by a human.
            - "MACHINE": the sentence is likely generated by a machine.
            - "UNKNOWN": no decision could be made.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")

        # Classify by search engine first.
        input_sentences = split_into_sentences(self.news_text)
        for sentence in input_sentences:
            paraphrase, text_url, aligned_sentence, img_urls = (
                detect_text_by_relative_search(sentence)
            )
            text_prediction_label = "UNKNOWN"

            if paraphrase is False:
                # No paraphrase source found: fall back to the AI model.
                print("\tFrom AI model:")
                text_prediction_label, text_prediction_score = (
                    detect_text_by_ai_model(sentence)
                )
                if aligned_sentence == []:
                    aligned_sentence = {
                        "input_sentence": sentence,
                        "matched_sentence": "",
                        "similarity": text_prediction_score,
                        "is_paraphrase_sentence": False,
                        "url": "",
                    }
            else:
                self.found_img_url.extend(img_urls)
                text_prediction_score = aligned_sentence["similarity"]
                if check_human(aligned_sentence):
                    text_prediction_label = "HUMAN"
                else:
                    text_prediction_label = "MACHINE"

            print(f"\ttext_prediction_label: {text_prediction_label}\n")
            self.text_prediction_label.append(text_prediction_label)
            self.aligned_sentences.append(aligned_sentence)
            self.is_paraphrased.append(paraphrase)
            self.text_referent_url.append(text_url)
            self.text_prediction_score.append(text_prediction_score)

            paraphrase = False
            text_url = ""
            aligned_sentence = {}
            img_urls = []
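    # Sketch of the per-sentence bookkeeping above (values are invented for
    # illustration; `verifier` stands for a NewsVerification instance): each
    # sentence of the combined "title\n\ncontent" text adds one entry to every
    # result list, so a two-sentence article might end up with
    #
    #   verifier.text_prediction_label -> ["HUMAN", "MACHINE"]
    #   verifier.text_referent_url     -> ["https://example.com/source", ""]
    #   verifier.is_paraphrased        -> [True, False]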
    def detect_image_origin(self):
        print("CHECK IMAGE:")
        if self.news_image is None:
            self.image_prediction_label = "UNKNOWN"
            self.image_prediction_score = 0.0
            self.image_referent_url = None
            return

        print(f"\t: Img path: {self.news_image}")

        # 1. Try to match the news image against images found during the text search.
        matched_url, similarity = detect_image_from_news_image(
            self.news_image,
            self.found_img_url,
        )
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        # 2. Fall back to a reverse image search.
        matched_url, similarity = detect_image_by_reverse_search(self.news_image)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        # 3. Finally, classify the image with the AI model.
        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            self.image_prediction_label = detected_label
            self.image_prediction_score = score
            self.image_referent_url = None
            return

        self.image_prediction_label = "UNKNOWN"
        self.image_prediction_score = 50
        self.image_referent_url = None

    def determine_news_origin(self):
        if self.text_prediction_label == "MACHINE":
            text_prediction_score = 100 - self.text_prediction_score
        elif self.text_prediction_label == "UNKNOWN":
            text_prediction_score = 50
        else:
            text_prediction_score = self.text_prediction_score

        if self.image_prediction_label == "MACHINE":
            image_prediction_score = 100 - self.image_prediction_score
        elif self.image_prediction_label == "UNKNOWN":
            image_prediction_score = 50
        else:
            image_prediction_score = self.image_prediction_score

        news_prediction_score = (text_prediction_score + image_prediction_score) / 2
        if news_prediction_score > 50:
            self.news_prediction_score = news_prediction_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - news_prediction_score
            self.news_prediction_label = "MACHINE"

    def generate_analysis_report(self):
        self.determine_text_origin()
        self.detect_image_origin()

    def analyze_details(self):
        self.analyzed_table = []

        # IMAGES:

        # TEXT
        for pair in self.aligned_sentences:
            print(f"pair: {pair}")
            if "input_sentence" not in pair:
                continue
            input_words, source_words, input_indexes, source_indexes = (
                self.highlight_overlap_by_word_to_list(
                    pair["input_sentence"],
                    pair["matched_sentence"],
                )
                # self.compare_sentences(
                #     pair["input_sentence"],
                #     pair["matched_sentence"],
                # )
            )
            self.analyzed_table.append(
                (input_words, source_words, input_indexes, source_indexes),
            )

        if len(self.analyzed_table) != 0:
            html_table = self.create_table()
        else:
            html_table = ""
        return html_table

    def highlight_overlap_by_word_to_list(self, text1, text2):
        """
        Returns:
            - the list of words in text1
            - the list of words in text2
            - the indexes of the overlapping (highlighted) words in text1
            - the indexes of the overlapping (highlighted) words in text2
        """
        # Split the strings into words on whitespace.
        words1 = text1.split()
        words2 = text2.split()

        index1 = []
        index2 = []

        # Use SequenceMatcher to find the matching runs between the two word lists.
        matcher = SequenceMatcher(None, words1, words2)

        highlighted_text1 = []
        highlighted_text2 = []

        # Track the current position in words1 and words2.
        current_pos1 = 0
        current_pos2 = 0

        # Iterate over the matching blocks.
        for match in matcher.get_matching_blocks():
            start1, start2, length = match
            print(start1, start2, length)

            # Keep the non-matching words unchanged.
            highlighted_text1.extend(words1[current_pos1:start1])
            highlighted_text2.extend(words2[current_pos2:start2])

            if length > 0:
                for i in range(start1, start1 + length):
                    index1.append(i)
                for i in range(start2, start2 + length):
                    index2.append(i)

            # Advance the current positions.
            current_pos1 = start1 + length
            current_pos2 = start2 + length

        return words1, words2, index1, index2
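    # Worked example for the word-overlap helper above (an illustrative sketch;
    # the two sentences are invented):
    #
    #   words1, words2, idx1, idx2 = NewsVerification().highlight_overlap_by_word_to_list(
    #       "the cat sat on the mat",
    #       "a cat sat on a mat",
    #   )
    #   # words1 -> ["the", "cat", "sat", "on", "the", "mat"]
    #   # words2 -> ["a", "cat", "sat", "on", "a", "mat"]
    #   # idx1 == idx2 == [1, 2, 3, 5]   ("cat sat on" and "mat" overlap)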
    def get_text_urls(self):
        return set(self.text_referent_url)

    def generate_colors_list(self, set_urls):
        # Assign a distinct color to each source URL index.
        color_dict = {}
        num_urls = len(set_urls)
        for i in range(num_urls):
            color_dict[i] = generate_color(i, num_urls)
        return color_dict

    def analyze_details_2(self):
        html_text = ""
        self.analyzed_table = []

        # TEXT
        # Assign unique colors to each index.
        set_urls = self.get_text_urls()
        color_dict = self.generate_colors_list(set_urls)

        # Position of the color in the input contents.
        position = 0
        for pair in self.aligned_sentences:
            if "input_sentence" not in pair:
                continue
            common_phrases, position = self.compare_sentences(
                pair["input_sentence"],
                pair["matched_sentence"],
                position,
                color_dict[0],  # TODO: set color
            )

        if len(self.analyzed_table) != 0:
            html_table = self.create_table()
        else:
            html_table = ""
        return html_text, html_table

    def compare_sentences(self, sentence_1, sentence_2, position, color):
        """
        Compares two sentences and identifies common phrases, outputting their
        start and end positions.

        Args:
            sentence_1: The first sentence (string).
            sentence_2: The second sentence (string).
            position: Offset added to the indexes into sentence_1 so that they
                refer to the concatenated input text.
            color: Highlight color attached to every common phrase.

        Returns:
            A tuple (common_phrases, position), where common_phrases is a list
            of dictionaries, one per common phrase, each containing:
                - "phrase": The common phrase (string).
                - "start_1": The starting index of the phrase in sentence_1 (int).
                - "end_1": The ending index of the phrase in sentence_1 (int).
                - "start_2": The starting index of the phrase in sentence_2 (int).
                - "end_2": The ending index of the phrase in sentence_2 (int).
                - "color": The highlight color.
            The list is empty if no common phrases are found or if either
            sentence is empty.
        """
        if not sentence_1 or not sentence_2:  # Handle empty strings.
            return [], position

        s = difflib.SequenceMatcher(None, sentence_1, sentence_2)

        common_phrases = []
        for block in s.get_matching_blocks():
            if block.size > 0:  # Ignore zero-length matches.
                start_1 = block.a
                end_1 = block.a + block.size
                start_2 = block.b
                end_2 = block.b + block.size
                # sentence_2[start_2:end_2] would give the same phrase.
                phrase = sentence_1[start_1:end_1]
                common_phrases.append({
                    "phrase": phrase,
                    "start_1": start_1 + position,
                    "end_1": end_1 + position,
                    "start_2": start_2,
                    "end_2": end_2,
                    "color": color,
                })
        position += len(sentence_1)
        return common_phrases, position
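    # Sketch of how compare_sentences threads `position` across calls (the
    # strings and color below are made-up placeholders, and `verifier` stands
    # for a NewsVerification instance):
    #
    #   phrases, position = verifier.compare_sentences("same text", "same text", 0, "#aabbcc")
    #   # phrases  -> [{"phrase": "same text", "start_1": 0, "end_1": 9,
    #   #               "start_2": 0, "end_2": 9, "color": "#aabbcc"}]
    #   # position -> 9; the next call adds this offset to its start_1/end_1 so
    #   # that the indexes refer to the concatenated input text.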
    def create_table(self):
        # table_rows = "\n".join([self.format_row(row) for row in self.analyzed_table])
        # Build one formatted row per analyzed sentence pair, preceded by the image row.
        rows = []
        max_length = 30  # TODO: put this in configuration
        rows.append(self.format_image_row(max_length))
        for index, row in enumerate(self.analyzed_table):
            formatted_row = self.format_text_row(row, index, max_length)
            rows.append(formatted_row)
        table = "\n".join(rows)

        # Wrap the formatted rows in an HTML table with the report's four columns.
        return f"""
        <table border="1" style="width:100%; border-collapse:collapse;">
            <thead>
                <tr>
                    <th>Input news</th>
                    <th>Source content</th>
                    <th>Forensic</th>
                    <th>Originality</th>
                </tr>
            </thead>
            <tbody>
                {table}
            </tbody>
        </table>
        """
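    # How the pieces above might be driven end to end (an illustrative sketch;
    # `title`, `content`, and `image_path` are placeholders, and rendering the
    # returned HTML is left to the caller):
    #
    #   verifier = NewsVerification()
    #   verifier.load_news(title, content, image_path)
    #   verifier.generate_analysis_report()      # fills the text/image predictions
    #   html_table = verifier.analyze_details()  # "Input news | Source content |
    #                                            #  Forensic | Originality" table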