from difflib import SequenceMatcher

from src.application.image.image_detection import (
    detect_image_by_ai_model,
    detect_image_by_reverse_search,
    detect_image_from_news_image,
)
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.search_detection import check_human, detect_text_by_relative_search


class NewsVerification:
    def __init__(self):
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""

        self.text_prediction_label = ""
        self.text_prediction_score = -1
        self.text_referent_url = None

        self.image_prediction_label = ""
        self.image_prediction_score = -1
        self.image_referent_url = None

        self.news_prediction_label = ""
        self.news_prediction_score = -1

        self.found_img_url = []
        self.aligned_sentences = []
        self.is_paraphrased = False

    def load_news(self, news_title, news_content, news_image):
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def determine_text_origin(self):
        """
        Determines the origin of the loaded news text based on paraphrasing
        detection and human-authorship analysis.

        Sets:
            text_prediction_label (str): "HUMAN" if the text is likely written
                by a human, "MACHINE" if it is likely machine-generated, or
                "UNKNOWN" if the search-based check is inconclusive.
            text_prediction_score (float): Confidence score as a percentage.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        # Classify by search engine
        (
            self.is_paraphrased,
            self.text_referent_url,
            self.aligned_sentences,
            self.found_img_url,
        ) = detect_text_by_relative_search(self.news_text)

        if self.is_paraphrased is False:
            self.text_prediction_label = "UNKNOWN"
        else:
            self.text_prediction_score = 100
            if check_human(self.aligned_sentences):
                self.text_prediction_label = "HUMAN"
            else:
                self.text_prediction_label = "MACHINE"

        # Classify text by AI model when the search-based check is inconclusive
        print("\tFrom AI model:")
        if self.text_prediction_label == "UNKNOWN":
            self.text_prediction_label, self.text_prediction_score = detect_text_by_ai_model(self.news_text)
            self.text_prediction_score *= 100

    def detect_image_origin(self):
        print("CHECK IMAGE:")
        if self.news_image is None:
            self.image_prediction_label = "UNKNOWN"
            self.image_prediction_score = 0.0
            self.image_referent_url = None
            return

        print(f"\tImg path: {self.news_image}")

        # 1. Match against the images collected during the text search.
        matched_url, similarity = detect_image_from_news_image(self.news_image, self.found_img_url)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        # 2. Fall back to a reverse image search.
        matched_url, similarity = detect_image_by_reverse_search(self.news_image)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self.image_prediction_label = "HUMAN"
            self.image_prediction_score = similarity
            self.image_referent_url = matched_url
            return

        # 3. Fall back to the AI detection model.
        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            self.image_prediction_label = detected_label
            self.image_prediction_score = score
            self.image_referent_url = None
            return

        # No method produced a verdict.
        self.image_prediction_label = "UNKNOWN"
        self.image_prediction_score = 50
        self.image_referent_url = None

    def determine_news_origin(self):
        # Normalize both scores to the confidence that the content is human-made.
        if self.text_prediction_label == "MACHINE":
            text_prediction_score = 100 - self.text_prediction_score
        elif self.text_prediction_label == "UNKNOWN":
            text_prediction_score = 50
        else:
            text_prediction_score = self.text_prediction_score

        if self.image_prediction_label == "MACHINE":
            image_prediction_score = 100 - self.image_prediction_score
        elif self.image_prediction_label == "UNKNOWN":
            image_prediction_score = 50
        else:
            image_prediction_score = self.image_prediction_score

        news_prediction_score = (text_prediction_score + image_prediction_score) / 2
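        # Worked example (illustrative): text classified HUMAN at 80 and image
        # UNKNOWN (normalized to 50) average to (80 + 50) / 2 = 65 > 50, so the
        # news is labeled HUMAN with a confidence of 65%.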
        if news_prediction_score > 50:
            self.news_prediction_score = news_prediction_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - news_prediction_score
            self.news_prediction_label = "MACHINE"

    def generate_analysis_report(self):
        self.determine_text_origin()
        self.detect_image_origin()
        self.determine_news_origin()

        # Forensic analysis
        if self.text_prediction_label == "MACHINE":
            text_prediction_label = "The text is modified by GPT-4o (AI)"
        else:
            text_prediction_label = "The text is written by HUMAN"

        if self.image_prediction_label == "MACHINE":
            image_prediction_label = "The image is generated by Dall-e (AI)"
        else:
            image_prediction_label = "The image is generated by HUMAN"

        if self.news_prediction_label == "MACHINE":
            news_prediction_label = "The whole news is generated by AI"
        else:
            news_prediction_label = "The whole news is written by HUMAN"

        # Misinformation analysis (placeholder values)
        out_of_context_results = "cohesive"
        if out_of_context_results == "cohesive":
            out_of_context_results = "The input news is cohesive (non-out-of-context)"
        else:
            out_of_context_results = "The input news is out-of-context"
        out_of_context_prediction_score = 96.7

        # Description (placeholder)
        description = "The description should be concise, clear, and aimed at helping general readers understand the case."

        # Reference links, truncated to 40 characters for display
        if self.text_referent_url is None:
            referred_news = "<li>No referent information</li>"
        else:
            url_max_length = min(40, len(self.text_referent_url))
            referred_news = f"""<li><a href="{self.text_referent_url}" target="_blank">{"Referred news: " + self.text_referent_url[:url_max_length] + "..."}</a></li>"""

        if self.image_referent_url is None:
            referred_image = "<li>No referent information</li>"
        else:
            url_max_length = min(40, len(self.image_referent_url))
            referred_image = f"""<li><a href="{self.image_referent_url}" target="_blank">{"Referred image: " + self.image_referent_url[:url_max_length] + "..."}</a></li>"""
        html_template = f"""
        <div>
            <h3>Originality:</h3>
            <ul>
                {referred_news}
                {referred_image}
            </ul>
        </div>
        <div>
            <h3>Forensic:</h3>
            <b>{news_prediction_label} (confidence = {self.news_prediction_score:.2f}%)</b>
            <ul>
                <li>{text_prediction_label} (confidence = {self.text_prediction_score:.2f}%)</li>
                <li>{image_prediction_label} (confidence = {self.image_prediction_score:.2f}%)</li>
            </ul>
        </div>
        <div>
            <h3>Misinformation (placeholder):</h3>
            <ul>
                <li>{out_of_context_results} (confidence = {out_of_context_prediction_score:.2f}%)</li>
            </ul>
        </div>
        <div>
            <h3>Description (optional, placeholder):</h3>
            <ul>
                <li>{description}</li>
            </ul>
        </div>
        """
        return html_template

    def analyze_details(self):
        final_table = []
        for pair in self.aligned_sentences:
            input_words, source_words, input_indexes, source_indexes = (
                self.highlight_overlap_by_word_to_list(
                    pair["input_sentence"],
                    pair["matched_sentence"],
                )
            )
            final_table.append(
                (input_words, source_words, input_indexes, source_indexes),
            )

        if len(final_table) != 0:
            html_table = self.create_table(final_table)
        else:
            html_table = ""

        return html_table

    def highlight_overlap_by_word_to_list(self, text1, text2):
        """
        Returns:
            - list of words in text1
            - list of words in text2
            - list of indexes of overlapping words in text1
            - list of indexes of overlapping words in text2
        """
        # Split the strings into words on whitespace
        words1 = text1.split()
        words2 = text2.split()

        index1 = []
        index2 = []

        # Use SequenceMatcher to find overlapping blocks between the word lists
        matcher = SequenceMatcher(None, words1, words2)

        highlighted_text1 = []
        highlighted_text2 = []

        # Track the current position in words1 and words2
        current_pos1 = 0
        current_pos2 = 0

        # Iterate over the matching blocks
        for match in matcher.get_matching_blocks():
            start1, start2, length = match

            # Add the non-overlapping words unchanged
            highlighted_text1.extend(words1[current_pos1:start1])
            highlighted_text2.extend(words2[current_pos2:start2])

            if length > 0:
                # Record the indexes of the overlapping words
                for i in range(start1, start1 + length):
                    index1.append(i)
                for i in range(start2, start2 + length):
                    index2.append(i)

            # Update the current positions
            current_pos1 = start1 + length
            current_pos2 = start2 + length

        return words1, words2, index1, index2
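
    # Illustrative example: for "the cat sat" vs "a cat sat down",
    # SequenceMatcher matches the block "cat sat" at index 1 in both word
    # lists, so the method returns
    # (["the", "cat", "sat"], ["a", "cat", "sat", "down"], [1, 2], [1, 2]).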

    def create_table(self, data):
        table_rows = "\n".join([self.format_pair(pair) for pair in data])
        return f"""
        <h5>Comparison between input news and <a href="{self.text_referent_url}" target="_blank">source news</a></h5>
        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
            <thead>
                <tr>
                    <th>Input sentence</th>
                    <th>Source sentence</th>
                </tr>
            </thead>
            <tbody>
                {table_rows}
            </tbody>
        </table>
        """

    def format_pair(self, pair):
        input_sentence = self.highlight_text(pair[0], pair[2])
        source_sentence = self.highlight_text(pair[1], pair[3])
        return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"

    def highlight_text(self, words, indexes):
        # Copy the word list so the caller's list is not mutated in place
        final_words = list(words)
        for index in indexes:
            final_words[index] = (
                f"<span style='color:#00FF00; font-weight:bold;'>{words[index]}</span>"
            )
        return " ".join(final_words)