from difflib import SequenceMatcher import pandas as pd from src.application.image.image_detection import ( detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image, ) from src.application.text.entity import ( apply_highlight, highlight_entities, ) from src.application.text.helper import extract_equal_text from src.application.text.model_detection import detect_text_by_ai_model from src.application.text.preprocessing import split_into_paragraphs from src.application.text.search_detection import ( check_human, detect_text_by_relative_search, find_text_source, ) class NewsVerification: def __init__(self): self.news_text = "" self.news_title = "" self.news_content = "" self.news_image = "" self.text_prediction_label: list[str] = [] self.text_prediction_score: list[float] = [] self.text_referent_url: list[str] = [] self.image_prediction_label: list[str] = [] self.image_prediction_score: list[str] = [] self.image_referent_url: list[str] = [] self.news_prediction_label = "" self.news_prediction_score = -1 self.found_img_url: list[str] = [] self.aligned_sentences: list[dict] = [] self.aligned_sentences_df: pd.DataFrame = pd.DataFrame( columns=[ "input_sentence", "matched_sentence", "label", "similarity", "paraphrase", "url", "group", "entities", ], ) self.is_paraphrased: list[bool] = [] self.ordinary_user_table: list = [] self.fact_checker_table: list = [] self.governor_table: list = [] self.entities_with_colors = [] def load_news(self, news_title, news_content, news_image): self.news_text = news_title + "\n\n" + news_content self.news_title = news_title self.news_content = news_content self.news_image = news_image def determine_text_origin(self): """ Determines the origin of the given text based on paraphrasing detection and human authorship analysis. Args: text: The input text to be analyzed. Returns: str: The predicted origin of the text: - "HUMAN": If the text is likely written by a human. - "MACHINE": If the text is likely generated by a machine. """ print("CHECK TEXT:") print("\tFrom search engine:") # Classify by search engine input_sentences = split_into_paragraphs(self.news_text) current_index = 0 previous_paraphrase = None ai_sentence = { "input_sentence": "", "matched_sentence": "", "label": "", "similarity": None, "paraphrase": False, "url": "", } for index, sentence in enumerate(input_sentences): print(f"-------index = {index}-------") print(f"current_sentence = {input_sentences[index]}") if current_index >= len(input_sentences): break if ( current_index > index and index != 0 and index != len(input_sentences) - 1 ): continue ( paraphrase, text_url, searched_sentences, img_urls, current_index, ) = detect_text_by_relative_search(input_sentences, index) if paraphrase is False: # add sentence to ai_sentence if ai_sentence["input_sentence"] != "": ai_sentence["input_sentence"] += "
" ai_sentence["input_sentence"] += sentence if index == len(input_sentences) - 1: # add ai_sentences to align_sentences text_prediction_label, text_prediction_score = ( detect_text_by_ai_model(ai_sentence["input_sentence"]) ) ai_sentence["label"] = text_prediction_label ai_sentence["similarity"] = text_prediction_score self.aligned_sentences.append(ai_sentence) else: if previous_paraphrase is False or previous_paraphrase is None: # add ai_sentences to align_sentences if ai_sentence[ "input_sentence" ] != "" or current_index >= len(input_sentences): text_prediction_label, text_prediction_score = ( detect_text_by_ai_model( ai_sentence["input_sentence"], ) ) ai_sentence["label"] = text_prediction_label ai_sentence["similarity"] = text_prediction_score self.aligned_sentences.append(ai_sentence) # reset ai_sentence = { "input_sentence": "", "matched_sentence": "", "label": "", "similarity": None, "paraphrase": False, "url": "", } # add searched_sentences to align_sentences if searched_sentences["input_sentence"] != "": self.found_img_url.extend(img_urls) if check_human(searched_sentences): searched_sentences["label"] = "HUMAN" else: searched_sentences["label"] = "MACHINE" self.aligned_sentences.append(searched_sentences) previous_paraphrase = paraphrase def determine_text_origin_2(self): """ Determines the origin of the given text based on paraphrasing detection and human authorship analysis. Args: text: The input text to be analyzed. Returns: str: The predicted origin of the text: - "HUMAN": If the text is likely written by a human. - "MACHINE": If the text is likely generated by a machine. """ print("CHECK TEXT:") print("\tFrom search engine:") # Classify by search engine input_sentences = split_into_paragraphs(self.news_text) for _ in range(5): self.aligned_sentences_df = pd.concat( [self.aligned_sentences_df, pd.DataFrame([{}])], ignore_index=False, ) for index, sentence in enumerate(input_sentences): print(f"-------index = {index}-------") print(f"current_sentence = {input_sentences[index]}") if self.aligned_sentences_df["url"] is not None: continue self.aligned_sentences_df, img_urls = find_text_source( input_sentences[index], self.aligned_sentences_df, ) def detect_image_origin(self): print("CHECK IMAGE:") if self.news_image is None: self.image_prediction_label = "UNKNOWN" self.image_prediction_score = 0.0 self.image_referent_url = None return for image in self.found_img_url: print(f"\tfound_img_url: {image}") matched_url, similarity = detect_image_from_news_image( self.news_image, self.found_img_url, ) if matched_url is not None: print(f"matching image: {matched_url}\nsimilarity: {similarity}\n") self.image_prediction_label = "HUMAN" self.image_prediction_score = similarity self.image_referent_url = matched_url return matched_url, similarity = detect_image_by_reverse_search( self.news_image, ) if matched_url is not None: print(f"matching image: {matched_url}\nsimilarity: {similarity}\n") self.image_prediction_label = "HUMAN" self.image_prediction_score = similarity self.image_referent_url = matched_url return detected_label, score = detect_image_by_ai_model(self.news_image) if detected_label: print(f"detected_label: {detected_label} ({score})") self.image_prediction_label = detected_label self.image_prediction_score = score self.image_referent_url = None return self.image_prediction_label = "UNKNOWN" self.image_prediction_score = 50 self.image_referent_url = None def determine_news_origin(self): if self.text_prediction_label == "MACHINE": text_prediction_score = 100 - self.text_prediction_score elif self.text_prediction_label == "UNKNOWN": text_prediction_score = 50 else: text_prediction_score = self.text_prediction_score if self.image_prediction_label == "MACHINE": image_prediction_score = 100 - self.image_prediction_score elif self.image_prediction_label == "UNKNOWN": image_prediction_score = 50 else: image_prediction_score = self.image_prediction_score news_prediction_score = ( text_prediction_score + image_prediction_score ) / 2 if news_prediction_score > 50: self.news_prediction_score = news_prediction_score self.news_prediction_label = "HUMAN" else: self.news_prediction_score = 100 - news_prediction_score self.news_prediction_label = "MACHINE" def generate_analysis_report(self): self.determine_text_origin() self.detect_image_origin() def analyze_details(self): entities_with_colors = [] for index, aligned_sentence in enumerate(self.aligned_sentences): # Get entity-words (in pair) with colors entities_with_colors = highlight_entities( aligned_sentence["input_sentence"], aligned_sentence["matched_sentence"], ) self.aligned_sentences[index]["entities"] = entities_with_colors ordinary_user_table = self.create_ordinary_user_table() fact_checker_table = self.create_fact_checker_table() governor_table = self.create_governor_table() return ordinary_user_table, fact_checker_table, governor_table def get_text_urls(self): return set(self.text_referent_url) def compare_sentences(self, sentence_1, sentence_2, position, color): """ Compares two sentences and identifies common phrases, outputting their start and end positions. """ if not sentence_1 or not sentence_2: # Handle empty strings return [] s = SequenceMatcher(None, sentence_1, sentence_2) common_phrases = [] for block in s.get_matching_blocks(): if block.size > 0: # Ignore zero-length matches start_1 = block.a end_1 = block.a + block.size start_2 = block.b end_2 = block.b + block.size phrase = sentence_1[ start_1:end_1 ] # Or sentence_2[start_2:end_2], they are the same common_phrases.append( { "phrase": phrase, "start_1": start_1 + position, "end_1": end_1 + position, "start_2": start_2, "end_2": end_2, "color": color, }, ) position += len(sentence_1) return common_phrases, position def create_fact_checker_table(self): rows = [] max_length = 30 # TODO: put this in configuration rows.append(self.format_image_fact_checker_row(max_length)) for aligned_sentence in self.aligned_sentences: if "input_sentence" not in aligned_sentence: continue # Get index of equal phrases in input and source sentences equal_idx_1, equal_idx_2 = extract_equal_text( aligned_sentence["input_sentence"], aligned_sentence["matched_sentence"], ) # Get entity-words (in pair) with colors # entities_with_colors = highlight_entities( # aligned_sentence["input_sentence"], # aligned_sentence["matched_sentence"], # ) self.fact_checker_table.append( [ aligned_sentence, equal_idx_1, equal_idx_2, aligned_sentence["entities"], ], ) for row in self.fact_checker_table: formatted_row = self.format_text_fact_checker_row(row, max_length) rows.append(formatted_row) table = "\n".join(rows) return f"""
Comparison between input news and source news:
{table}
Input news Source (corresponding URL provided in Originality) Forensic Originality