# NOTE: removed non-Python paste artifact ("Spaces: / Sleeping / Sleeping") left over from a hosting-UI copy.
import difflib
from difflib import SequenceMatcher

from src.application.highlight_text import generate_color
from src.application.image.image_detection import (
    detect_image_by_ai_model,
    detect_image_by_reverse_search,
    detect_image_from_news_image,
)
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.preprocessing import split_into_sentences
from src.application.text.search_detection import check_human, detect_text_by_relative_search
class NewsVerification():
    """Checks whether a news article (title + content + image) is human-written
    or machine-generated, and renders an HTML comparison report."""

    def __init__(self):
        # Raw article inputs (filled by load_news)
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""
        # Per-sentence text analysis results (one entry per input sentence)
        self.text_prediction_label: list[str] = []
        self.text_prediction_score: list[float] = []
        self.text_referent_url: list[str] = []
        # Image analysis results — detect_image_origin later replaces these
        # lists with scalar label/score/url values.
        self.image_prediction_label: list[str] = []
        # Fix: scores are numeric, not strings (was annotated list[str]).
        self.image_prediction_score: list[float] = []
        self.image_referent_url: list[str] = []
        # Aggregated verdict for the whole article; -1 means "not computed yet"
        self.news_prediction_label = ""
        self.news_prediction_score = -1
        # Supporting data collected during analysis
        self.found_img_url: list[str] = []
        self.aligned_sentences: list[dict] = []
        self.is_paraphrased: list[bool] = []
        self.analyzed_table: list[list] = []
def load_news(self, news_title, news_content, news_image): | |
self.news_text = news_title + "\n\n" + news_content | |
self.news_title = news_title | |
self.news_content = news_content | |
self.news_image = news_image | |
    def determine_text_origin(self):
        """
        Determines the origin of the given text based on paraphrasing detection
        and human authorship analysis.

        Splits self.news_text into sentences and, for each one, appends a
        result to self.text_prediction_label, self.text_prediction_score,
        self.text_referent_url, self.aligned_sentences and self.is_paraphrased.

        Labels used per sentence:
            - "HUMAN": the sentence is likely written by a human.
            - "MACHINE": the sentence is likely machine-generated.
            - "UNKNOWN": default before classification.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        # Classify by search engine
        input_sentences = split_into_sentences(self.news_text)
        for sentence in input_sentences:
            paraphrase, text_url, aligned_sentence, img_urls = detect_text_by_relative_search(sentence)
            text_prediction_label = "UNKNOWN"
            # NOTE(review): only an exact False takes the AI-model branch; if
            # detect_text_by_relative_search can return None for "no result",
            # flow falls into the else branch below — confirm the helper's contract.
            if paraphrase is False:
                # Classify text by AI model
                print("\tFrom AI model:")
                text_prediction_label, text_prediction_score = detect_text_by_ai_model(sentence)
                if aligned_sentence == []:
                    # No search alignment available: synthesize a placeholder
                    # record so downstream code always sees the same dict shape.
                    aligned_sentence = {
                        "input_sentence": sentence,
                        "matched_sentence": "",
                        "similarity": text_prediction_score,
                        "is_paraphrase_sentence": False,
                        "url": "",
                    }
            else:
                # Search found a match: collect its images and classify by
                # whether the aligned pair looks human-authored.
                self.found_img_url.extend(img_urls)
                text_prediction_score = aligned_sentence["similarity"]
                if check_human(aligned_sentence):
                    text_prediction_label = "HUMAN"
                else:
                    text_prediction_label = "MACHINE"
            print(f"\ttext_prediction_label: {text_prediction_label}\n")
            self.text_prediction_label.append(text_prediction_label)
            self.aligned_sentences.append(aligned_sentence)
            self.is_paraphrased.append(paraphrase)
            self.text_referent_url.append(text_url)
            self.text_prediction_score.append(text_prediction_score)
            # NOTE(review): these resets are redundant — every variable is
            # reassigned at the top of the next iteration.
            paraphrase = False
            text_url = ""
            aligned_sentence = {}
            img_urls = []
def detect_image_origin(self): | |
print("CHECK IMAGE:") | |
if self.news_image is None: | |
self.image_prediction_label = "UNKNOWN" | |
self.image_prediction_score = 0.0 | |
self.image_referent_url = None | |
return | |
print(f"\t: Img path: {self.news_image}") | |
matched_url, similarity = detect_image_from_news_image(self.news_image, self.found_img_url) | |
if matched_url is not None: | |
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n") | |
self.image_prediction_label = "HUMAN" | |
self.image_prediction_score = similarity | |
self.image_referent_url = matched_url | |
return | |
matched_url, similarity = detect_image_by_reverse_search(self.news_image) | |
if matched_url is not None: | |
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n") | |
self.image_prediction_label = "HUMAN" | |
self.image_prediction_score = similarity | |
self.image_referent_url = matched_url | |
return | |
detected_label, score = detect_image_by_ai_model(self.news_image) | |
if detected_label: | |
self.image_prediction_label = detected_label | |
self.image_prediction_score = score | |
self.image_referent_url = None | |
return | |
self.image_prediction_label = "UNKNOWN" | |
self.image_prediction_score = 50 | |
self.image_referent_url = None | |
    def determine_news_origin(self):
        """Combine the text and image verdicts into an overall article verdict.

        Maps each component onto a human-likelihood score out of 100
        (UNKNOWN -> 50, MACHINE -> 100 - score), averages the two, and stores
        the result in self.news_prediction_label / self.news_prediction_score.

        NOTE(review): self.text_prediction_label and self.text_prediction_score
        are per-sentence *lists* (see __init__ and determine_text_origin), so
        the string comparisons below can never match and the arithmetic on a
        list would raise TypeError — confirm whether these are collapsed to
        scalars before this method runs. Also, text scores appear to be on a
        0..1 scale (format_text_row multiplies by 100) while this math assumes
        0..100 — verify the intended scale.
        """
        if self.text_prediction_label == "MACHINE":
            text_prediction_score = 100 - self.text_prediction_score
        elif self.text_prediction_label == "UNKNOWN":
            text_prediction_score = 50
        else:
            text_prediction_score = self.text_prediction_score
        if self.image_prediction_label == "MACHINE":
            image_prediction_score = 100 - self.image_prediction_score
        elif self.image_prediction_label == "UNKNOWN":
            image_prediction_score = 50
        else:
            image_prediction_score = self.image_prediction_score
        # Average of the two components; above 50 means "more likely human".
        news_prediction_score = (text_prediction_score + image_prediction_score) / 2
        if news_prediction_score > 50:
            self.news_prediction_score = news_prediction_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - news_prediction_score
            self.news_prediction_label = "MACHINE"
def generate_analysis_report(self): | |
self.determine_text_origin() | |
self.detect_image_origin() | |
def analyze_details(self): | |
self.analyzed_table = [] | |
# IMAGES: | |
# TEXT | |
for pair in self.aligned_sentences: | |
print(f"pair: {pair}") | |
if "input_sentence" not in pair: | |
continue | |
input_words, source_words, input_indexes, source_indexes = ( | |
self.highlight_overlap_by_word_to_list( | |
pair["input_sentence"], | |
pair["matched_sentence"], | |
) | |
# self.compare_sentences( | |
# pair["input_sentence"], | |
# pair["matched_sentence"], | |
# ) | |
) | |
self.analyzed_table.append( | |
(input_words, source_words, input_indexes, source_indexes), | |
) | |
if len(self.analyzed_table) != 0: | |
html_table = self.create_table() | |
else: | |
html_table = "" | |
return html_table | |
def highlight_overlap_by_word_to_list(self, text1, text2): | |
""" | |
Return | |
- list of words in text1 | |
- list of words in text2 | |
- list of index of highlight words in text 1 | |
- list of index of highlight words in text 2 | |
""" | |
# Tách chuỗi thành các từ (word) dựa vào khoảng trắng | |
words1 = text1.split() | |
words2 = text2.split() | |
index1 = [] | |
index2 = [] | |
# Sử dụng SequenceMatcher để tìm các đoạn trùng lặp giữa danh sách các từ | |
matcher = SequenceMatcher(None, words1, words2) | |
highlighted_text1 = [] | |
highlighted_text2 = [] | |
# Theo dõi vị trí hiện tại trong words1 và words2 | |
current_pos1 = 0 | |
current_pos2 = 0 | |
# Lặp qua các đoạn so khớp | |
for match in matcher.get_matching_blocks(): | |
start1, start2, length = match | |
print(start1, start2, length) | |
# Thêm các từ không trùng lặp vào (giữ nguyên) | |
highlighted_text1.extend(words1[current_pos1:start1]) | |
highlighted_text2.extend(words2[current_pos2:start2]) | |
if length > 0: | |
for i in range(start1, start1 + length): | |
index1.append(i) | |
for i in range(start2, start2 + length): | |
index2.append(i) | |
# Cập nhật vị trí hiện tại | |
current_pos1 = start1 + length | |
current_pos2 = start2 + length | |
return words1, words2, index1, index2 | |
def get_text_urls(self): | |
return set(self.text_referent_url) | |
def generate_colors_list(self, set_urls): | |
color_dict = {} | |
num_urls = len(set_urls) | |
for i in range(num_urls): | |
color_dict[i] = generate_color(i, num_urls) | |
return color_dict | |
def analyze_details_2(self): | |
html_text = "" | |
self.analyzed_table = [] | |
# TEXT | |
# Assign unique colors to each index | |
set_urls = self.get_text_urls() | |
color_dict = self.generate_colors_list(set_urls) | |
# position of the color in the input contents | |
position = 0 | |
for pair in self.aligned_sentences: | |
if "input_sentence" not in pair: | |
continue | |
common_phrases, position = self.compare_sentences( | |
pair["input_sentence"], | |
pair["matched_sentence"], | |
position, | |
color_dict["0"], # TODO: set color | |
) | |
if len(self.analyzed_table) != 0: | |
html_table = self.create_table() | |
else: | |
html_table = "" | |
return html_text, html_table | |
def compare_sentences(self, sentence_1, sentence_2, position, color): | |
""" | |
Compares two sentences and identifies common phrases, outputting their start and end positions. | |
Args: | |
sentence_1: The first sentence (string). | |
sentence_2: The second sentence (string). | |
Returns: | |
A list of dictionaries, where each dictionary represents a common phrase and contains: | |
- "phrase": The common phrase (string). | |
- "start_1": The starting index of the phrase in sentence_1 (int). | |
- "end_1": The ending index of the phrase in sentence_1 (int). | |
- "start_2": The starting index of the phrase in sentence_2 (int). | |
- "end_2": The ending index of the phrase in sentence_2 (int). | |
Returns an empty list if no common phrases are found. Handles edge cases like empty strings. | |
""" | |
if not sentence_1 or not sentence_2: # Handle empty strings | |
return [] | |
s = difflib.SequenceMatcher(None, sentence_1, sentence_2) | |
common_phrases = [] | |
for block in s.get_matching_blocks(): | |
if block.size > 0: # Ignore zero-length matches | |
start_1 = block.a | |
end_1 = block.a + block.size | |
start_2 = block.b | |
end_2 = block.b + block.size | |
phrase = sentence_1[start_1:end_1] # Or sentence_2[start_2:end_2], they are the same | |
common_phrases.append({ | |
"phrase": phrase, | |
"start_1": start_1 + position, | |
"end_1": end_1 + position, | |
"start_2": start_2, | |
"end_2": end_2, | |
"color": color, | |
}) | |
position += len(sentence_1) | |
return common_phrases, position | |
    def create_table(self):
        """Render self.analyzed_table as an HTML comparison table.

        The first row compares the images; each following row compares one
        input sentence with its matched source sentence.

        NOTE(review): the returned HTML ends with an unclosed <style> tag —
        confirm that a later concatenation supplies the CSS and </style>.
        """
        #table_rows = "\n".join([self.format_row(row) for row in self.analyzed_table])
        # loop of self.analyzed_table with index:
        rows = []
        max_length = 30  # TODO: put this in configuration
        rows.append(self.format_image_row(max_length))
        for index, row in enumerate(self.analyzed_table):
            formatted_row = self.format_text_row(row, index, max_length)
            rows.append(formatted_row)
        table = "\n".join(rows)
        return f"""
        <h5>Comparison between input news and source news</h5>
        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
            <thead>
            <tr>
                <th>Input news</th>
                <th>Source content</th>
                <th>Forensic</th>
                <th>Originality</th>
            </tr>
            </thead>
            <tbody>
                {table}
            </tbody>
        </table>
        <style>
        """
def format_text_row(self, row, index = 0, max_length=30): | |
input_sentence = self.highlight_text(row[0], row[2]) # text, index of highlight words | |
source_sentence = self.highlight_text(row[1], row[3]) # text, index of highlight words | |
url = self.aligned_sentences[index]["url"] # | |
short_url = self.shorten_url(url, max_length) | |
source_text_url = f"""<a href="{url}">{short_url}</a>""" | |
# short_url = self.shorten_url(self.text_referent_url[index], max_length) | |
# source_text_url = f"""<a href="{self.text_referent_url[index]}">{short_url}</a>""" | |
self.text_prediction_score[index] | |
return f"""<tr><td>{input_sentence}</td><td>{source_sentence}</td><td>{self.text_prediction_label[index]}<br>({self.text_prediction_score[index]*100:.2f}%)</td><td>{source_text_url}</td></tr>""" | |
def format_image_row(self, max_length=30): | |
# input_image = f"""<img src="{self.news_image}" width="200" height="150">""" | |
print(f"self.news_image = {self.news_image}") | |
source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">""" | |
short_url = self.shorten_url(self.image_referent_url, max_length) | |
source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>""" | |
return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>""" | |
def shorten_url(self, url, max_length=30): | |
if url is None: | |
return "" | |
if len(url) > max_length: | |
short_url = url[:max_length] + "..." | |
else: | |
short_url = url | |
return short_url | |
def highlight_text(self, words, indexes): | |
final_words = words | |
for index in indexes: | |
final_words[index] = ( | |
f"<span style='color:#00FF00; font-weight:bold;'>{words[index]}</span>" | |
) | |
return " ".join(final_words) | |