from difflib import SequenceMatcher

from src.application.image.image_detection import (
    detect_image_by_ai_model,
    detect_image_by_reverse_search,
    detect_image_from_news_image,
)
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.search_detection import check_human, detect_text_by_relative_search


class NewsVerification:
    def __init__(self):
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""

        self.text_prediction_label = ""
        self.text_prediction_score = -1
        self.text_referent_url = None

        self.image_prediction_label = ""
        self.image_prediction_score = -1
        self.image_referent_url = None

        self.news_prediction_label = ""
        self.news_prediction_score = -1

        self.found_img_url = []
        self.aligned_sentences = []
        self.is_paraphrased = False

    def load_news(self, news_title, news_content, news_image):
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def determine_text_origin(self):
        """
        Determines the origin of the loaded news text based on paraphrasing
        detection and human-authorship analysis.

        Sets:
            text_prediction_label (str): "HUMAN" if the text is likely written
                by a human, "MACHINE" if it is likely machine-generated, or
                "UNKNOWN" if the search-based check is inconclusive.
            text_prediction_score (float): Confidence score as a percentage.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        # Classify by search engine
        (
            self.is_paraphrased,
            self.text_referent_url,
            self.aligned_sentences,
            self.found_img_url,
        ) = detect_text_by_relative_search(self.news_text)

        if self.is_paraphrased is False:
            self.text_prediction_label = "UNKNOWN"
        else:
            self.text_prediction_score = 100
            if check_human(self.aligned_sentences):
                self.text_prediction_label = "HUMAN"
            else:
                self.text_prediction_label = "MACHINE"

        # Classify text by AI model when the search-based check is inconclusive
        print("\tFrom AI model:")
        if self.text_prediction_label == "UNKNOWN":
            self.text_prediction_label, self.text_prediction_score = detect_text_by_ai_model(self.news_text)
            self.text_prediction_score *= 100

    def detect_image_origin(self):
        print("CHECK IMAGE:")
        if self.news_image is None:
            self.image_prediction_label = "UNKNOWN"
            self.image_prediction_score = 0.0
            self.image_referent_url = None
            return

        print(f"\tImg path: {self.news_image}")

        # 1. Match against the images collected during the text search.
        matched_url, similarity = detect_image_from_news_image(self.news_image, self.found_img_url)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        # 2. Fall back to a reverse image search.
        matched_url, similarity = detect_image_by_reverse_search(self.news_image)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        # 3. Fall back to the AI detection model.
        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            self.image_prediction_label = detected_label
            self.image_prediction_score = score
            self.image_referent_url = None
            return

        # No method produced a verdict.
        self.image_prediction_label = "UNKNOWN"
        self.image_prediction_score = 50
        self.image_referent_url = None

    def determine_news_origin(self):
        # Normalize both scores to the confidence that the content is human-made.
        if self.text_prediction_label == "MACHINE":
            text_prediction_score = 100 - self.text_prediction_score
        elif self.text_prediction_label == "UNKNOWN":
            text_prediction_score = 50
        else:
            text_prediction_score = self.text_prediction_score

        if self.image_prediction_label == "MACHINE":
            image_prediction_score = 100 - self.image_prediction_score
        elif self.image_prediction_label == "UNKNOWN":
            image_prediction_score = 50
        else:
            image_prediction_score = self.image_prediction_score

        news_prediction_score = (text_prediction_score + image_prediction_score) / 2
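        # Worked example (illustrative): text classified HUMAN at 80 and image
        # UNKNOWN (normalized to 50) average to (80 + 50) / 2 = 65 > 50, so the
        # news is labeled HUMAN with a confidence of 65%.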
        if news_prediction_score > 50:
            self.news_prediction_score = news_prediction_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - news_prediction_score
            self.news_prediction_label = "MACHINE"

    def generate_analysis_report(self):
        self.determine_text_origin()
        self.detect_image_origin()
        self.determine_news_origin()

        # Forensic analysis
        if self.text_prediction_label == "MACHINE":
            text_prediction_label = "The text is modified by GPT-4o (AI)"
        else:
            text_prediction_label = "The text is written by HUMAN"

        if self.image_prediction_label == "MACHINE":
            image_prediction_label = "The image is generated by Dall-e (AI)"
        else:
            image_prediction_label = "The image is generated by HUMAN"

        if self.news_prediction_label == "MACHINE":
            news_prediction_label = "The whole news is generated by AI"
        else:
            news_prediction_label = "The whole news is written by HUMAN"

        # Misinformation analysis (placeholder values)
        out_of_context_results = "cohesive"
        if out_of_context_results == "cohesive":
            out_of_context_results = "The input news is cohesive (non-out-of-context)"
        else:
            out_of_context_results = "The input news is out-of-context"
        out_of_context_prediction_score = 96.7

        # Description (placeholder)
        description = "The description should be concise, clear, and aimed at helping general readers understand the case."

        # Reference links, truncated to 40 characters for display
        if self.text_referent_url is None:
            referred_news = "<li>No referent information</li>"
        else:
            url_max_length = min(40, len(self.text_referent_url))
            referred_news = f"""<li><a href="{self.text_referent_url}" target="_blank">{"Referred news: " + self.text_referent_url[:url_max_length] + "..."}</a></li>"""

        if self.image_referent_url is None:
            referred_image = "<li>No referent information</li>"
        else:
            url_max_length = min(40, len(self.image_referent_url))
            referred_image = f"""<li><a href="{self.image_referent_url}" target="_blank">{"Referred image: " + self.image_referent_url[:url_max_length] + "..."}</a></li>"""
        html_template = f"""
        <div>
            <h3>Originality:</h3>
            <ul>
                {referred_news}
                {referred_image}
            </ul>
        </div>
        <div>
            <h3>Forensic:</h3>
            <b>{news_prediction_label} (confidence = {self.news_prediction_score:.2f}%)</b>
            <ul>
                <li>{text_prediction_label} (confidence = {self.text_prediction_score:.2f}%)</li>
                <li>{image_prediction_label} (confidence = {self.image_prediction_score:.2f}%)</li>
            </ul>
        </div>
        <div>
            <h3>Misinformation (placeholder):</h3>
            <ul>
                <li>{out_of_context_results} (confidence = {out_of_context_prediction_score:.2f}%)</li>
            </ul>
        </div>
        <div>
            <h3>Description (optional, placeholder):</h3>
            <ul>
                <li>{description}</li>
            </ul>
        </div>
        """
        return html_template

    def analyze_details(self):
        final_table = []
        for pair in self.aligned_sentences:
            input_words, source_words, input_indexes, source_indexes = (
                self.highlight_overlap_by_word_to_list(
                    pair["input_sentence"],
                    pair["matched_sentence"],
                )
            )
            final_table.append(
                (input_words, source_words, input_indexes, source_indexes),
            )

        if len(final_table) != 0:
            html_table = self.create_table(final_table)
        else:
            html_table = ""

        return html_table

    def highlight_overlap_by_word_to_list(self, text1, text2):
        """
        Returns:
            - list of words in text1
            - list of words in text2
            - list of indexes of overlapping words in text1
            - list of indexes of overlapping words in text2
        """
        # Split the strings into words on whitespace
        words1 = text1.split()
        words2 = text2.split()

        index1 = []
        index2 = []

        # Use SequenceMatcher to find overlapping blocks between the word lists
        matcher = SequenceMatcher(None, words1, words2)

        highlighted_text1 = []
        highlighted_text2 = []

        # Track the current position in words1 and words2
        current_pos1 = 0
        current_pos2 = 0

        # Iterate over the matching blocks
        for match in matcher.get_matching_blocks():
            start1, start2, length = match

            # Add the non-overlapping words unchanged
            highlighted_text1.extend(words1[current_pos1:start1])
            highlighted_text2.extend(words2[current_pos2:start2])

            if length > 0:
                # Record the indexes of the overlapping words
                for i in range(start1, start1 + length):
                    index1.append(i)
                for i in range(start2, start2 + length):
                    index2.append(i)

            # Update the current positions
            current_pos1 = start1 + length
            current_pos2 = start2 + length

        return words1, words2, index1, index2
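
    # Illustrative example: for "the cat sat" vs "a cat sat down",
    # SequenceMatcher matches the block "cat sat" at index 1 in both word
    # lists, so the method returns
    # (["the", "cat", "sat"], ["a", "cat", "sat", "down"], [1, 2], [1, 2]).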

    def create_table(self, data):
        table_rows = "\n".join([self.format_pair(pair) for pair in data])
        return f"""
        <h5>Comparison between input news and <a href="{self.text_referent_url}" target="_blank">source news</a></h5>
        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
            <thead>
                <tr>
                    <th>Input sentence</th>
                    <th>Source sentence</th>
                </tr>
            </thead>
            <tbody>
                {table_rows}
            </tbody>
        </table>
        """

    def format_pair(self, pair):
        input_sentence = self.highlight_text(pair[0], pair[2])
        source_sentence = self.highlight_text(pair[1], pair[3])
        return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"

    def highlight_text(self, words, indexes):
        # Copy the word list so the caller's list is not mutated in place
        final_words = list(words)
        for index in indexes:
            final_words[index] = (
                f"<span style='color:#00FF00; font-weight:bold;'>{words[index]}</span>"
            )
        return " ".join(final_words)