news_verification / src /application /content_detection.py
pmkhanh7890's picture
Edit the demo
badcb49
raw
history blame
11.8 kB
import html
from difflib import SequenceMatcher

from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.search_detection import check_human, detect_text_by_relative_search
class NewsVerification:
    """Combine text and image origin checks into a single verdict for a news item.

    Typical use: call ``load_news`` and then ``generate_analysis_report``
    (which runs the text, image and combined checks and returns an HTML
    summary); ``analyze_details`` afterwards returns an HTML table comparing
    the input sentences with the matched source sentences.
    """

    # Longest URL prefix shown as link text in the report.
    MAX_DISPLAYED_URL_LENGTH = 40

    def __init__(self):
        # Raw inputs (filled in by load_news).
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""

        # Text verdict: label, confidence and the URL of the matched source.
        self.text_prediction_label = ""
        self.text_prediction_score = -1
        self.text_referent_url = None

        # Image verdict: label, confidence and the URL of the matched image.
        self.image_prediction_label = ""
        self.image_prediction_score = -1
        self.image_referent_url = None

        # Combined verdict for the whole news item.
        self.news_prediction_label = ""
        self.news_prediction_score = -1

        # Intermediate results produced by the search-based text check.
        self.found_img_url = []
        self.aligned_sentences = []
        self.is_paraphrased = False

    def load_news(self, news_title, news_content, news_image):
        """Store the news item to analyse.

        Args:
            news_title: Title of the news.
            news_content: Body of the news.
            news_image: Image of the news, or None when there is no image.
        """
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def determine_text_origin(self):
        """Classify ``self.news_text`` as "HUMAN" or "MACHINE".

        The search-based check runs first; when it reports a paraphrased
        source, ``check_human`` on the aligned sentences decides the label and
        the score is set to 100. Otherwise the AI model decides and its score
        is multiplied by 100.

        Results are stored on the instance (``text_prediction_label``,
        ``text_prediction_score``, ``text_referent_url``,
        ``aligned_sentences``, ``found_img_url``, ``is_paraphrased``);
        nothing is returned.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        # Classify by search engine
        (
            self.is_paraphrased,
            self.text_referent_url,
            self.aligned_sentences,
            self.found_img_url,
        ) = detect_text_by_relative_search(self.news_text)

        if not self.is_paraphrased:
            self.text_prediction_label = "UNKNOWN"
        else:
            self.text_prediction_score = 100
            if check_human(self.aligned_sentences):
                self.text_prediction_label = "HUMAN"
            else:
                self.text_prediction_label = "MACHINE"

        # Classify text by AI model
        print("\tFrom AI model:")
        if self.text_prediction_label == "UNKNOWN":
            self.text_prediction_label, self.text_prediction_score = (
                detect_text_by_ai_model(self.news_text)
            )
            # Scale to a percentage, like the search-based score above.
            # NOTE(review): assumes the model returns a score in [0, 1] - confirm.
            self.text_prediction_score *= 100

    def _set_image_result(self, label, score, referent_url):
        """Record the image verdict on the instance."""
        self.image_prediction_label = label
        self.image_prediction_score = score
        self.image_referent_url = referent_url

    def detect_image_origin(self):
        """Classify ``self.news_image`` and store the verdict on the instance.

        Order of checks: images found by the text search, then reverse image
        search, then the AI model. A match from either search labels the image
        "HUMAN" with the reported similarity as score. Without an image the
        verdict is "UNKNOWN" with score 0.0; when every check is inconclusive
        it is "UNKNOWN" with score 50.
        """
        print("CHECK IMAGE:")
        if self.news_image is None:
            self._set_image_result("UNKNOWN", 0.0, None)
            return

        print(f"\t: Img path: {self.news_image}")

        matched_url, similarity = detect_image_from_news_image(
            self.news_image, self.found_img_url
        )
        if matched_url is None:
            # Nothing among the images of the referred news: search the web.
            matched_url, similarity = detect_image_by_reverse_search(self.news_image)

        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self._set_image_result("HUMAN", similarity, matched_url)
            return

        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            self._set_image_result(detected_label, score, None)
            return

        self._set_image_result("UNKNOWN", 50, None)

    @staticmethod
    def _human_likelihood(label, score):
        """Map a (label, score) verdict to a 0-100 "written by human" score."""
        if label == "MACHINE":
            return 100 - score
        if label == "UNKNOWN":
            return 50
        return score

    def determine_news_origin(self):
        """Average the text and image verdicts into the overall news verdict.

        Both verdicts are first converted to a "human" score ("MACHINE" scores
        are inverted, "UNKNOWN" counts as 50). A mean above 50 yields "HUMAN";
        otherwise the label is "MACHINE" and the stored score is 100 - mean.
        """
        human_score = (
            self._human_likelihood(self.text_prediction_label, self.text_prediction_score)
            + self._human_likelihood(self.image_prediction_label, self.image_prediction_score)
        ) / 2

        if human_score > 50:
            self.news_prediction_score = human_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - human_score
            self.news_prediction_label = "MACHINE"

    @classmethod
    def _format_reference_item(cls, url):
        """Return the ``<li>`` element linking to *url* (or a "no info" item)."""
        if url is None:
            return "<li>No referent information</li>"

        url = str(url)
        shown = url[: cls.MAX_DISPLAYED_URL_LENGTH]
        if len(url) > cls.MAX_DISPLAYED_URL_LENGTH:
            # Only signal truncation when something was actually cut off.
            shown += "..."

        # URLs come from search results, so escape them before embedding.
        href = html.escape(url, quote=True)
        text = html.escape("Referred news: " + shown, quote=False)
        return f'<li><a href="{href}" target="_blank">{text}</a></li>'

    def _build_report_html(self):
        """Render the HTML report from the verdicts already stored on the instance."""
        # Forensic analysis
        if self.text_prediction_label == "MACHINE":
            text_prediction_label = "The text is modified by GPT-4o (AI)"
        else:
            text_prediction_label = "The text is written by HUMAN"

        if self.image_prediction_label == "MACHINE":
            image_prediction_label = "The image is generated by Dall-e (AI)"
        else:
            image_prediction_label = "The image is generated by HUMAN"

        if self.news_prediction_label == "MACHINE":
            news_prediction_label = "The whole news generated by AI"
        else:
            news_prediction_label = "The whole news written by HUMAN"

        # Misinformation analysis (placeholder values, no real check yet)
        out_of_context_results = "cohesive"
        if out_of_context_results == "cohesive":
            out_of_context_results = "The input news is cohesive (non-out-of-context)"
        else:
            out_of_context_results = "The input news is out-of-context"
        out_of_context_prediction_score = 96.7

        # Description (placeholder)
        description = "The description should be concise, clear, and aimed at helping general readers understand the case."

        referred_news = self._format_reference_item(self.text_referent_url)
        referred_image = self._format_reference_item(self.image_referent_url)

        html_template = f"""
        <div>
            <h3>Originality:</h3>
            <ul>
                {referred_news}
                {referred_image}
            </ul>
        </div>

        <div>
            <h3>Forensic:</h3>
            <b>{news_prediction_label} (confidence = {self.news_prediction_score:.2f}%)</b>
            <ul>
                <li>{text_prediction_label} (confidence = {self.text_prediction_score:.2f}%)</li>
                <li>{image_prediction_label} (confidence = {self.image_prediction_score:.2f}%)</li>
            </ul>
        </div>

        <div>
            <h3>Misinformation (placeholder):</h3>
            <ul>
                <li>{out_of_context_results} (confidence = {out_of_context_prediction_score:.2f}%)</li>
            </ul>
        </div>

        <div>
            <h3>Description (optional, placeholder):</h3>
            <ul>
                <li>{description}</li>
            </ul>
        </div>
        """
        return html_template

    def generate_analysis_report(self):
        """Run the text, image and combined checks and return the HTML report.

        Returns:
            str: HTML with the originality links, the forensic verdicts and
            the placeholder misinformation/description sections.
        """
        self.determine_text_origin()
        self.detect_image_origin()
        self.determine_news_origin()
        return self._build_report_html()

    def analyze_details(self):
        """Return an HTML table comparing input sentences with their matches.

        Each entry of ``self.aligned_sentences`` is read through its
        "input_sentence" and "matched_sentence" keys.

        Returns:
            str: The comparison table, or "" when there are no aligned
            sentences.
        """
        final_table = [
            self.highlight_overlap_by_word_to_list(
                pair["input_sentence"],
                pair["matched_sentence"],
            )
            for pair in (self.aligned_sentences or [])
        ]
        if not final_table:
            return ""
        return self.create_table(final_table)

    def highlight_overlap_by_word_to_list(self, text1, text2):
        """Find the words shared by two texts.

        Returns:
            tuple: ``(words1, words2, index1, index2)`` where
            - words1: list of words in text1
            - words2: list of words in text2
            - index1: indexes of the words of text1 to highlight
            - index2: indexes of the words of text2 to highlight
        """
        # Split both strings into words on whitespace.
        words1 = text1.split()
        words2 = text2.split()

        # SequenceMatcher yields the runs of words common to both lists.
        matcher = SequenceMatcher(None, words1, words2)

        index1 = []
        index2 = []
        for start1, start2, length in matcher.get_matching_blocks():
            # The final sentinel block has length 0 and contributes nothing.
            index1.extend(range(start1, start1 + length))
            index2.extend(range(start2, start2 + length))

        return words1, words2, index1, index2

    def create_table(self, data):
        """Build the HTML comparison table.

        Args:
            data: Iterable of ``(input_words, source_words, input_indexes,
                source_indexes)`` tuples, one per sentence pair.

        Returns:
            str: Heading (linking to ``self.text_referent_url`` when it is
            set) followed by the table markup.
        """
        table_rows = "\n".join(self.format_pair(pair) for pair in data)

        if self.text_referent_url is None:
            source_link = "source news"
        else:
            href = html.escape(str(self.text_referent_url), quote=True)
            source_link = f'<a href="{href}" target="_blank">source news</a>'

        return f"""
        <h5>Comparison between input news and {source_link}</h5>
        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
            <thead>
                <tr>
                    <th>Input sentence</th>
                    <th>Source sentence</th>
                </tr>
            </thead>
            <tbody>
                {table_rows}
            </tbody>
        </table>
        """

    def format_pair(self, pair):
        """Render one ``(input_words, source_words, input_indexes, source_indexes)`` tuple as a table row."""
        input_sentence = self.highlight_text(pair[0], pair[2])
        source_sentence = self.highlight_text(pair[1], pair[3])
        return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"

    def highlight_text(self, words, indexes):
        """Join *words* into a sentence, wrapping those at *indexes* in a highlight span.

        The input list is left untouched, and every word is HTML-escaped
        because it originates from user-supplied or scraped text.
        """
        final_words = [html.escape(word, quote=False) for word in words]
        for index in indexes:
            final_words[index] = (
                f"<span style='color:#00FF00; font-weight:bold;'>{final_words[index]}</span>"
            )
        return " ".join(final_words)