Spaces:

pmkhanh7890
/

news_verification

Sleeping

App Files Files

news_verification / src /application /content_detection.py

pmkhanh7890

revise demo

d952fbe 3 months ago

raw

history blame

15.1 kB

	import difflib
	import html
	from difflib import SequenceMatcher

	from src.application.highlight_text import generate_color
	from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
	from src.application.text.model_detection import detect_text_by_ai_model
	from src.application.text.preprocessing import split_into_sentences
	from src.application.text.search_detection import check_human, detect_text_by_relative_search


	class NewsVerification:
	    """Run text and image origin checks on one news item and render them as HTML.

	    Typical flow: ``load_news`` -> ``generate_analysis_report`` ->
	    ``analyze_details``.

	    Score scales differ between the two modalities: per-sentence text scores
	    are treated as fractions (``format_text_row`` multiplies them by 100
	    before printing), while the image score is treated as a percentage
	    (``format_image_row`` prints it as-is).
	    """

	    def __init__(self):
	        self.news_text = ""
	        self.news_title = ""
	        self.news_content = ""
	        self.news_image = ""

	        # Per-sentence results; determine_text_origin() appends to all of
	        # these in lockstep, so index i always refers to the same sentence.
	        self.text_prediction_label: list[str] = []
	        self.text_prediction_score: list[float] = []
	        self.text_referent_url: list[str] = []
	        # Single-image result, written by detect_image_origin().
	        self.image_prediction_label: str = "UNKNOWN"
	        self.image_prediction_score: float = 0.0
	        self.image_referent_url: str | None = None
	        self.news_prediction_label = ""
	        self.news_prediction_score = -1

	        self.found_img_url: list[str] = []
	        self.aligned_sentences: list[dict] = []
	        self.is_paraphrased: list[bool] = []
	        # One (input_words, source_words, input_indexes, source_indexes)
	        # tuple per rendered sentence.
	        self.analyzed_table: list[tuple] = []
	        # For each row of analyzed_table, the index of the sentence it was
	        # built from (rows can be skipped, so the two may differ).
	        self._analyzed_row_indexes: list[int] = []

	    def load_news(self, news_title, news_content, news_image):
	        """Store the news item to analyse.

	        Args:
	            news_title: Headline text; ``None`` is treated as an empty string.
	            news_content: Body text; ``None`` is treated as an empty string.
	            news_image: Image reference handed to the image detectors, or
	                ``None`` when the item has no image.
	        """
	        news_title = news_title or ""
	        news_content = news_content or ""
	        self.news_text = news_title + "\n\n" + news_content
	        self.news_title = news_title
	        self.news_content = news_content
	        self.news_image = news_image

	    def determine_text_origin(self):
	        """Classify every sentence of ``self.news_text`` and record the results.

	        Each sentence is first looked up with ``detect_text_by_relative_search``.
	        When that reports a paraphrase, the label comes from ``check_human`` on
	        the aligned sentence and the score is its ``"similarity"``; otherwise
	        the sentence is scored by ``detect_text_by_ai_model``.

	        Results are stored on the instance (``text_prediction_label``,
	        ``text_prediction_score``, ``text_referent_url``, ``aligned_sentences``,
	        ``is_paraphrased``, ``found_img_url``); nothing is returned. Results of
	        a previous run are discarded first so that repeated calls do not
	        duplicate rows.
	        """
	        print("CHECK TEXT:")
	        print("\tFrom search engine:")

	        self.text_prediction_label = []
	        self.text_prediction_score = []
	        self.text_referent_url = []
	        self.aligned_sentences = []
	        self.is_paraphrased = []
	        self.found_img_url = []

	        for sentence in split_into_sentences(self.news_text):
	            paraphrase, text_url, aligned_sentence, img_urls = detect_text_by_relative_search(sentence)

	            if paraphrase is False:
	                # No source found on the web: fall back to the AI model.
	                print("\tFrom AI model:")
	                text_prediction_label, text_prediction_score = detect_text_by_ai_model(sentence)
	                if not aligned_sentence:
	                    # Placeholder so every sentence still yields a table row.
	                    aligned_sentence = {
	                        "input_sentence": sentence,
	                        "matched_sentence": "",
	                        "similarity": text_prediction_score,
	                        "is_paraphrase_sentence": False,
	                        "url": "",
	                    }
	            else:
	                self.found_img_url.extend(img_urls or [])
	                text_prediction_score = aligned_sentence["similarity"]
	                if check_human(aligned_sentence):
	                    text_prediction_label = "HUMAN"
	                else:
	                    text_prediction_label = "MACHINE"

	            print(f"\ttext_prediction_label: {text_prediction_label}\n")
	            self.text_prediction_label.append(text_prediction_label)
	            self.aligned_sentences.append(aligned_sentence)
	            self.is_paraphrased.append(paraphrase)
	            self.text_referent_url.append(text_url)
	            self.text_prediction_score.append(text_prediction_score)

	    def _set_image_result(self, label, score, url):
	        """Record the outcome of the image check in one place."""
	        self.image_prediction_label = label
	        self.image_prediction_score = score
	        self.image_referent_url = url

	    def detect_image_origin(self):
	        """Classify ``self.news_image`` and record label, score and reference URL.

	        The image is compared against the images found next to the matched
	        text, then reverse-searched, and only then scored by the AI model. The
	        first step that produces a result wins. A missing image is recorded as
	        ``"UNKNOWN"`` with score 0.0; an image none of the steps could classify
	        is recorded as ``"UNKNOWN"`` with score 50.
	        """
	        print("CHECK IMAGE:")
	        image = self.news_image
	        if image is None or (isinstance(image, str) and not image):
	            self._set_image_result("UNKNOWN", 0.0, None)
	            return

	        print(f"\t: Img path: {image}")
	        searches = (
	            lambda: detect_image_from_news_image(image, self.found_img_url),
	            lambda: detect_image_by_reverse_search(image),
	        )
	        for search in searches:
	            matched_url, similarity = search()
	            if matched_url is not None:
	                print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
	                self._set_image_result("HUMAN", similarity, matched_url)
	                return

	        detected_label, score = detect_image_by_ai_model(image)
	        if detected_label:
	            self._set_image_result(detected_label, score, None)
	            return

	        self._set_image_result("UNKNOWN", 50, None)

	    @staticmethod
	    def _human_score(label, score, scale=1):
	        """Map a (label, score) pair onto a 0-100 "written by a human" scale.

	        ``scale`` converts ``score`` to a percentage first. ``"UNKNOWN"`` maps
	        to the neutral 50 without looking at ``score``.
	        """
	        if label == "UNKNOWN":
	            return 50
	        percent = score * scale
	        if label == "MACHINE":
	            return 100 - percent
	        return percent

	    def determine_news_origin(self):
	        """Combine text and image results into one label and confidence.

	        Every sentence and the image are mapped to a 0-100 "human" score
	        (``"MACHINE"`` inverts the score, ``"UNKNOWN"`` counts as 50). The
	        sentence scores are averaged, then averaged with the image score.
	        Above 50 the news is labelled ``"HUMAN"``; otherwise ``"MACHINE"`` with
	        the score inverted. With no analysed sentences the text side counts
	        as 50.
	        """
	        # Text scores are fractions (see format_text_row), hence scale=100.
	        sentence_scores = [
	            self._human_score(label, score, scale=100)
	            for label, score in zip(self.text_prediction_label, self.text_prediction_score)
	        ]
	        if sentence_scores:
	            text_prediction_score = sum(sentence_scores) / len(sentence_scores)
	        else:
	            text_prediction_score = 50

	        # The image score is already a percentage (see format_image_row).
	        image_prediction_score = self._human_score(
	            self.image_prediction_label,
	            self.image_prediction_score,
	        )

	        news_prediction_score = (text_prediction_score + image_prediction_score) / 2
	        if news_prediction_score > 50:
	            self.news_prediction_score = news_prediction_score
	            self.news_prediction_label = "HUMAN"
	        else:
	            self.news_prediction_score = 100 - news_prediction_score
	            self.news_prediction_label = "MACHINE"

	    def generate_analysis_report(self):
	        """Run the text check followed by the image check."""
	        self.determine_text_origin()
	        self.detect_image_origin()

	    def analyze_details(self):
	        """Build the word-level comparison table for all aligned sentences.

	        Returns:
	            The HTML produced by ``create_table``, or ``""`` when no aligned
	            sentence carries an ``"input_sentence"``.
	        """
	        self.analyzed_table = []
	        self._analyzed_row_indexes = []

	        for sentence_index, pair in enumerate(self.aligned_sentences):
	            if not pair or "input_sentence" not in pair:
	                continue
	            row = self.highlight_overlap_by_word_to_list(
	                pair["input_sentence"],
	                pair.get("matched_sentence") or "",
	            )
	            self.analyzed_table.append(row)
	            # Remember the source sentence so that a skipped pair does not
	            # shift the labels/scores/URLs of the rows that follow it.
	            self._analyzed_row_indexes.append(sentence_index)

	        return self.create_table() if self.analyzed_table else ""

	    def highlight_overlap_by_word_to_list(self, text1, text2):
	        """Find the words two texts share, in order.

	        Both texts are split on whitespace and compared with
	        ``SequenceMatcher``.

	        Returns:
	            A tuple of
	            - the list of words in ``text1``
	            - the list of words in ``text2``
	            - the indexes of the matched words in the first list
	            - the indexes of the matched words in the second list
	        """
	        words1 = text1.split()
	        words2 = text2.split()

	        index1 = []
	        index2 = []
	        # autojunk=False: the default heuristic ignores frequent words once a
	        # sequence reaches 200 items, which would drop legitimate matches.
	        matcher = SequenceMatcher(None, words1, words2, autojunk=False)
	        for start1, start2, length in matcher.get_matching_blocks():
	            # The terminating block has length 0 and contributes nothing.
	            index1.extend(range(start1, start1 + length))
	            index2.extend(range(start2, start2 + length))

	        return words1, words2, index1, index2

	    def get_text_urls(self):
	        """Return the distinct reference URLs recorded for the sentences."""
	        return set(self.text_referent_url)

	    def generate_colors_list(self, set_urls):
	        """Return one colour per URL, keyed by position ``0 .. len(set_urls) - 1``."""
	        num_urls = len(set_urls)
	        return {i: generate_color(i, num_urls) for i in range(num_urls)}

	    def analyze_details_2(self):
	        """Character-level variant of ``analyze_details`` (work in progress).

	        Walks the aligned sentences and computes their common phrases with
	        ``compare_sentences``, giving each sentence the colour assigned to its
	        reference URL.

	        TODO: the phrases are not rendered yet, so this currently returns
	        ``("", "")``.

	        Returns:
	            A ``(html_text, html_table)`` tuple.
	        """
	        html_text = ""

	        self.analyzed_table = []
	        self._analyzed_row_indexes = []

	        # One colour per distinct URL. Enumerating the same set object fixes
	        # the URL -> colour-key mapping for the duration of this call.
	        set_urls = self.get_text_urls()
	        color_dict = self.generate_colors_list(set_urls)
	        color_key_by_url = {url: key for key, url in enumerate(set_urls)}

	        # Running offset of the current sentence within the input text.
	        position = 0
	        for sentence_index, pair in enumerate(self.aligned_sentences):
	            if not pair or "input_sentence" not in pair:
	                continue
	            if sentence_index < len(self.text_referent_url):
	                url = self.text_referent_url[sentence_index]
	            else:
	                url = None
	            color = color_dict.get(color_key_by_url.get(url))
	            _, position = self.compare_sentences(
	                pair["input_sentence"],
	                pair.get("matched_sentence") or "",
	                position,
	                color,
	            )

	        html_table = self.create_table() if self.analyzed_table else ""
	        return html_text, html_table

	    def compare_sentences(self, sentence_1, sentence_2, position, color):
	        """Find the character runs two sentences have in common.

	        Args:
	            sentence_1: The first sentence (string).
	            sentence_2: The second sentence (string).
	            position: Offset added to the ``sentence_1`` indexes.
	                NOTE(review): presumably the offset of ``sentence_1`` within
	                the whole input text; separators between sentences are not
	                counted -- confirm against the caller.
	            color: Value stored under ``"color"`` in every phrase.

	        Returns:
	            A ``(common_phrases, new_position)`` tuple. ``common_phrases`` is a
	            list of dictionaries, one per common phrase, with:
	            - "phrase": The common phrase (string).
	            - "start_1": Start index in sentence_1, shifted by ``position``.
	            - "end_1": End index in sentence_1, shifted by ``position``.
	            - "start_2": Start index in sentence_2 (int).
	            - "end_2": End index in sentence_2 (int).
	            - "color": The ``color`` argument.
	            ``new_position`` is ``position`` advanced by ``len(sentence_1)``.
	            If either sentence is empty the list is empty, but a tuple is
	            still returned so callers can always unpack two values.
	        """
	        if not sentence_1 or not sentence_2:
	            return [], position + len(sentence_1 or "")

	        # autojunk=False: on strings of 200+ characters the default heuristic
	        # treats frequent characters as junk and fragments the matches.
	        matcher = SequenceMatcher(None, sentence_1, sentence_2, autojunk=False)
	        common_phrases = [
	            {
	                "phrase": sentence_1[block.a:block.a + block.size],
	                "start_1": block.a + position,
	                "end_1": block.a + block.size + position,
	                "start_2": block.b,
	                "end_2": block.b + block.size,
	                "color": color,
	            }
	            for block in matcher.get_matching_blocks()
	            if block.size > 0  # skip the zero-length terminator block
	        ]
	        return common_phrases, position + len(sentence_1)

	    def create_table(self, max_length=30):
	        """Render the image row and all analysed text rows as an HTML table.

	        Args:
	            max_length: Maximum number of URL characters shown as link text.

	        Returns:
	            The HTML for a heading followed by the comparison table.
	        """
	        # Prefer the sentence indexes recorded by analyze_details(); fall back
	        # to row order when analyzed_table was filled in some other way.
	        if len(self._analyzed_row_indexes) == len(self.analyzed_table):
	            sentence_indexes = self._analyzed_row_indexes
	        else:
	            sentence_indexes = range(len(self.analyzed_table))

	        rows = [self.format_image_row(max_length)]
	        rows.extend(
	            self.format_text_row(row, sentence_index, max_length)
	            for row, sentence_index in zip(self.analyzed_table, sentence_indexes)
	        )
	        table = "\n".join(rows)
	        return f"""
	        <h5>Comparison between input news and source news</h5>
	        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
	        <thead>
	        <tr>
	        <th>Input news</th>
	        <th>Source content</th>
	        <th>Forensic</th>
	        <th>Originality</th>
	        </tr>
	        </thead>
	        <tbody>
	        {table}
	        </tbody>
	        </table>
	        """

	    def _format_link(self, url, max_length):
	        """Return an escaped ``<a>`` element for ``url``, or ``""`` without a URL."""
	        if not url:
	            return ""
	        href = html.escape(url, quote=True)
	        text = html.escape(self.shorten_url(url, max_length))
	        return f"""<a href="{href}">{text}</a>"""

	    def format_text_row(self, row, index=0, max_length=30):
	        """Render one sentence comparison as a ``<tr>``.

	        Args:
	            row: ``(input_words, source_words, input_indexes, source_indexes)``.
	            index: Index of the sentence in ``aligned_sentences``,
	                ``text_prediction_label`` and ``text_prediction_score``.
	            max_length: Maximum number of URL characters shown as link text.
	        """
	        input_sentence = self.highlight_text(row[0], row[2])
	        source_sentence = self.highlight_text(row[1], row[3])

	        url = self.aligned_sentences[index].get("url") or ""
	        source_text_url = self._format_link(url, max_length)

	        label = html.escape(str(self.text_prediction_label[index]))
	        # Text scores are fractions; show them as percentages.
	        score = self.text_prediction_score[index] * 100
	        return f"""<tr><td>{input_sentence}</td><td>{source_sentence}</td><td>{label}<br>({score:.2f}%)</td><td>{source_text_url}</td></tr>"""

	    def format_image_row(self, max_length=30):
	        """Render the image result as a ``<tr>``.

	        The source-image and link cells stay empty when no reference image
	        URL was found.
	        """
	        url = self.image_referent_url
	        if url:
	            safe_url = html.escape(url, quote=True)
	            source_image = f"""<img src="{safe_url}" width="200" height="150">"""
	        else:
	            source_image = ""
	        source_image_url = self._format_link(url, max_length)
	        label = html.escape(str(self.image_prediction_label))
	        return f"""<tr><td>input image</td><td>{source_image}</td><td>{label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""

	    def shorten_url(self, url, max_length=30):
	        """Return ``url`` cut to ``max_length`` characters plus ``"..."`` if longer.

	        ``None`` (or an empty URL) yields ``""``.
	        """
	        if not url:
	            return ""
	        if len(url) > max_length:
	            return url[:max_length] + "..."
	        return url

	    def highlight_text(self, words, indexes):
	        """Join ``words`` into HTML, wrapping the words at ``indexes`` in a span.

	        Every word is HTML-escaped. ``words`` itself is left untouched, so the
	        same row can be rendered more than once.
	        """
	        highlighted = set(indexes)
	        return " ".join(
	            f"<span style='color:#00FF00; font-weight:bold;'>{html.escape(word)}</span>"
	            if position in highlighted
	            else html.escape(word)
	            for position, word in enumerate(words)
	        )