Spaces:

pmkhanh7890
/

news_verification

Sleeping

App Files Files

news_verification / src /application /text /search_detection.py

pmkhanh7890

run pre-commit

38fd181 3 months ago

raw

history blame

15.1 kB

	import string
	import warnings
	from difflib import SequenceMatcher

	import nltk
	import numpy as np
	import torch
	from sentence_transformers import (
	SentenceTransformer,
	util,
	)

	from src.application.text.helper import extract_equal_text
	from src.application.text.preprocessing import split_into_paragraphs
	from src.application.text.search import (
	generate_search_phrases,
	search_by_google,
	)
	from src.application.url_reader import URLReader

	# Ignore every FutureWarning raised after this module has been imported.
	warnings.simplefilter(action="ignore", category=FutureWarning)

	# Download necessary NLTK data files (runs at import time; quiet=True
	# suppresses the downloader's console output).
	nltk.download("punkt", quiet=True)
	nltk.download("punkt_tab", quiet=True)
	nltk.download("stopwords", quiet=True)

	# Load the sentence-embedding model once at import time and move it to the
	# GPU when CUDA is available, otherwise keep it on the CPU.
	DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
	PARAPHASE_MODEL.to(DEVICE)

	# NOTE(review): not referenced anywhere in the visible part of this file.
	BATCH_SIZE = 8

	# Cosine similarity that a best match must exceed to count as a paraphrase.
	PARAPHRASE_THRESHOLD = 0.8
	# NOTE(review): not referenced anywhere in the visible part of this file.
	PARAPHRASE_THRESHOLD_FOR_OPPOSITE = 0.7
	# Word-count thresholds meant for check_sentence()'s
	# min_same_sentence_len / min_phrase_sentence_len parameters.
	MIN_SAME_SENTENCE_LEN = 6
	MIN_PHRASE_SENTENCE_LEN = 10
	# NOTE(review): not referenced anywhere in the visible part of this file.
	MIN_RATIO_PARAPHRASE_NUM = 0.7
	# Pages whose title + text are longer than this many characters are skipped.
	MAX_CHAR_SIZE = 30000


	def detect_text_by_relative_search(
	    input_text,
	    index,
	    is_support_opposite=False,
	):
	    """
	    Look up ``input_text[index]`` on the web and compare it, together with
	    the sentences that follow it, against the first usable page found.

	    Search phrases are generated from the sentence, each phrase is sent to
	    Google, and only the first three result links that contain "bbc.com"
	    are read. The first page that is extracted, has both a title and a
	    text, and is not longer than MAX_CHAR_SIZE characters decides the
	    result; no further page is tried afterwards.

	    Args:
	        input_text: Sequence of sentences (indexed and measured with len()).
	        index: Position of the sentence to start with.
	        is_support_opposite: Accepted for callers; not used in this function.

	    Returns:
	        A tuple ``(paraphrase, url, aligned_sentences, images, index)``:
	        - no usable page: ``(False, None, [], [], index)``;
	        - first sentence is not a paraphrase: the result of
	          check_paraphrase for that sentence with ``index`` unchanged;
	        - otherwise the alignment of the first sentence extended with
	          every directly following sentence that is also a paraphrase of
	          the same page. ``index`` is then the position of the first
	          sentence that did not match, or ``len(input_text)`` when all
	          remaining sentences matched.
	    """
	    visited_urls = set()
	    phrases = generate_search_phrases(input_text[index])

	    for phrase in phrases:
	        response = search_by_google(phrase)
	        links = [entry["link"] for entry in response.get("items", [])]

	        for url in links[:3]:
	            # Skip pages that were already read and anything outside BBC.
	            if url in visited_urls or "bbc.com" not in url:
	                continue

	            visited_urls.add(url)
	            print(f"\t\tChecking URL: {url}")

	            content = URLReader(url)
	            if content.is_extracted is not True:
	                continue

	            if content.title is None or content.text is None:
	                print("\t\t\t↑↑↑ Title or text not found")
	                continue

	            page_text = content.title + "\n" + content.text
	            if len(page_text) > MAX_CHAR_SIZE:
	                print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
	                continue
	            print(f"\t\t\t↑↑↑ Title: {content.title}")

	            paraphrase, aligned_first_sentences = check_paraphrase(
	                input_text[index],
	                page_text,
	                url,
	            )
	            if paraphrase is False:
	                # The first usable page settles the answer either way.
	                return (
	                    paraphrase,
	                    url,
	                    aligned_first_sentences,
	                    content.images,
	                    index,
	                )

	            # Keep consuming the following sentences for as long as they
	            # are paraphrases of the same page.
	            while True:
	                index += 1
	                print(f"----search {index} < {len(input_text)}----")
	                if index >= len(input_text):
	                    print(f"input_text_last: {input_text[-1]}")
	                    break
	                print(f"input_text: {input_text[index]}")

	                sub_paraphrase, sub_sentences = check_paraphrase(
	                    input_text[index],
	                    page_text,
	                    url,
	                )
	                print(f"sub_paraphrase: {sub_paraphrase}")
	                print(f"sub_sentences: {sub_sentences}")
	                if sub_paraphrase is not True:
	                    break

	                for key in ("input_sentence", "matched_sentence"):
	                    aligned_first_sentences[key] += "<br>" + sub_sentences[key]
	                # NOTE(review): halving after every addition weights later
	                # sentences more than an arithmetic mean would - confirm
	                # that this is the intended aggregate.
	                aligned_first_sentences["similarity"] += sub_sentences[
	                    "similarity"
	                ]
	                aligned_first_sentences["similarity"] /= 2

	            print(f"paraphrase: {paraphrase}")
	            print(f"aligned_first_sentences: {aligned_first_sentences}")
	            return (
	                paraphrase,
	                url,
	                aligned_first_sentences,
	                content.images,
	                index,
	            )

	    return False, None, [], [], index


	def find_text_source(text, text_index, sentences_df):
	    """
	    Find a web page for ``text[text_index]`` and record the match in
	    ``sentences_df``.

	    Search phrases are generated from the sentence, each phrase is sent to
	    Google, and only the first three result links that contain "bbc.com"
	    are read. The first page that is extracted, has both a title and a
	    text, is not longer than MAX_CHAR_SIZE characters and yields an
	    alignment decides the result.

	    Args:
	        text: Sequence of sentences.
	        text_index: Position of the sentence to look up. It is also used
	            as the row label in ``sentences_df``.
	        sentences_df: pandas DataFrame with the columns "input_sentence",
	            "matched_sentence", "label", "similarity" and "url". It is
	            modified in place.

	    Returns:
	        ``(sentences_df, images)`` once a page has been compared; images
	        is ``[]`` when the sentence is not a paraphrase of that page and
	        ``content.images`` otherwise.
	        ``(sentence, [])`` when no usable page was found, where
	        ``sentence`` is an empty result record for the input sentence.
	    """
	    sentence = {
	        "input_sentence": text[text_index],
	        "matched_sentence": "",
	        "label": "",
	        "similarity": None,
	        "paraphrase": None,
	        "url": "",
	        "group": None,
	    }
	    checked_urls = set()
	    searched_phrases = generate_search_phrases(text[text_index])

	    for candidate in searched_phrases:
	        search_results = search_by_google(candidate)
	        urls = [item["link"] for item in search_results.get("items", [])]

	        for url in urls[:3]:
	            # Skip pages that were already read and anything outside BBC.
	            if url in checked_urls or "bbc.com" not in url:
	                continue

	            checked_urls.add(url)
	            print(f"\t\tChecking URL: {url}")

	            content = URLReader(url)
	            if content.is_extracted is not True:
	                continue

	            if content.title is None or content.text is None:
	                print("\t\t\t↑↑↑ Title or text not found")
	                continue

	            page_text = content.title + "\n" + content.text
	            if len(page_text) > MAX_CHAR_SIZE:
	                print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
	                continue
	            print(f"\t\t\t↑↑↑ Title: {content.title}")

	            # check_paraphrase only accepts a single string; handing it
	            # the whole sentence list always produced (False, []).
	            _, aligned_sentence = check_paraphrase(
	                text[text_index],
	                page_text,
	                url,
	            )
	            if not aligned_sentence:
	                # The page could not be split into sentences.
	                continue

	            _store_alignment(sentences_df, text_index, aligned_sentence)

	            # Truthiness instead of "is False": the flag may be a NumPy
	            # bool, which is never identical to the False singleton.
	            if not aligned_sentence["paraphrase"]:
	                return sentences_df, []

	            # Check whether the other sentences come from the same page.
	            # NOTE(review): assumes the row labels of sentences_df are the
	            # positions of the sentences in `text` - confirm with callers.
	            for row_label in sentences_df.index:
	                if row_label == text_index:
	                    continue
	                if _has_source(sentences_df.loc[row_label, "url"]):
	                    continue

	                _, other_sentence = check_paraphrase(
	                    text[row_label],
	                    page_text,
	                    url,
	                )
	                if not other_sentence or not other_sentence["paraphrase"]:
	                    continue

	                _store_alignment(sentences_df, row_label, other_sentence)

	            return sentences_df, content.images

	    # NOTE(review): this fallback returns the record dict instead of
	    # sentences_df; kept as it was because callers may rely on it.
	    return sentence, []


	def _store_alignment(sentences_df, row_label, aligned_sentence):
	    """Copy the fields of one check_paraphrase alignment into a row."""
	    for column in (
	        "input_sentence",
	        "matched_sentence",
	        "label",
	        "similarity",
	        "url",
	    ):
	        sentences_df.loc[row_label, column] = aligned_sentence[column]


	def _has_source(url_value):
	    """Return True when a stored url cell holds a non-empty string."""
	    return isinstance(url_value, str) and url_value != ""


	def longest_common_subsequence(arr1, arr2):
	    """
	    Return the length of the longest run of consecutive, equal elements
	    shared by two lists (i.e. the longest common contiguous sub-list).

	    Args:
	        arr1: The first list.
	        arr2: The second list.

	    Returns:
	        The length of the longest common contiguous run.
	        0 if either argument is not a list or is empty.
	    """
	    if not (isinstance(arr1, list) and isinstance(arr2, list)):
	        return 0
	    if not arr1 or not arr2:
	        return 0

	    best = 0
	    # previous[j] is the length of the common run that ends at the element
	    # before `left` in arr1 and at arr2[j - 1]; only that row is needed.
	    previous = [0] * (len(arr2) + 1)
	    for left in arr1:
	        current = [0]
	        for col, right in enumerate(arr2, start=1):
	            # A mismatch resets the run because it must be consecutive.
	            run = previous[col - 1] + 1 if left == right else 0
	            current.append(run)
	            if run > best:
	                best = run
	        previous = current

	    return best


	def check_sentence(
	    input_sentence,
	    source_sentence,
	    min_same_sentence_len,
	    min_phrase_sentence_len,
	    verbose=False,
	):
	    """
	    Tell whether two sentences match, either exactly or through a long
	    enough run of consecutive shared words.

	    Args:
	        input_sentence: The input sentence.
	        source_sentence: The source sentence.
	        min_same_sentence_len: Minimum number of words an exact match
	            needs to be accepted.
	        min_phrase_sentence_len: Minimum number of consecutive shared
	            words needed to be accepted.
	        verbose: If True, print debug information.

	    Returns:
	        True if the sentences are considered similar, False otherwise.
	        False if an argument is not a string or is blank.
	    """
	    both_strings = isinstance(input_sentence, str) and isinstance(
	        source_sentence,
	        str,
	    )
	    if not both_strings:
	        return False

	    candidate = input_sentence.strip()
	    reference = source_sentence.strip()
	    if not (candidate and reference):
	        return False

	    # Whitespace split: runs of blanks never produce empty words.
	    candidate_words = candidate.split()
	    reference_words = reference.split()

	    is_exact = candidate == reference
	    if is_exact and len(candidate_words) >= min_same_sentence_len:
	        if verbose:
	            print("Exact match found.")
	        return True

	    max_overlap_len = longest_common_subsequence(
	        candidate_words,
	        reference_words,
	    )
	    if verbose:
	        print(f"Max overlap length: {max_overlap_len}")

	    return True if max_overlap_len >= min_phrase_sentence_len else False


	def check_paraphrase(input_text, page_text, url):
	    """
	    Check whether ``input_text`` is paraphrased in the text of a web page.

	    The input is treated as one sentence. It is embedded together with the
	    sentences of the page, and the page sentence with the highest cosine
	    similarity is taken as the best match. The text is reported as a
	    paraphrase when either
	    1. that similarity is greater than PARAPHRASE_THRESHOLD, or
	    2. the spans that extract_equal_text reports as equal between the
	       input and the best match add up to more than half of the input's
	       word count (punctuation removed).

	    Args:
	        input_text: The sentence to check.
	        page_text: The text of the web page to compare with.
	        url: Address of the page; copied into the alignment on a match.

	    Returns:
	        A tuple ``(is_paraphrase_text, alignment)``.
	        ``(False, [])`` when an argument is not a string, ``page_text`` is
	        empty, or the page yields no sentences. Otherwise ``alignment`` is
	        a dict with the keys "input_sentence", "matched_sentence",
	        "similarity", "label", "paraphrase" and "url"; "matched_sentence"
	        and "url" are "" unless rule 1 holds.
	    """
	    if not isinstance(input_text, str) or not isinstance(page_text, str):
	        return False, []
	    if not page_text:
	        return False, []

	    page_sentences = split_into_paragraphs(page_text)
	    if not page_sentences:
	        return False, []

	    # Also offer every sentence containing ", external" without that
	    # marker as an additional candidate.
	    page_sentences.extend(
	        [
	            sentence.replace(", external", "")
	            for sentence in page_sentences
	            if ", external" in sentence
	        ],
	    )

	    # Encode sentences into embeddings
	    input_embedding = PARAPHASE_MODEL.encode(
	        [input_text],
	        convert_to_tensor=True,
	        device=DEVICE,
	    )
	    page_embeddings = PARAPHASE_MODEL.encode(
	        page_sentences,
	        convert_to_tensor=True,
	        device=DEVICE,
	    )

	    # One row only, because the input is a single sentence.
	    similarities = (
	        util.cos_sim(input_embedding, page_embeddings).cpu().numpy()[0]
	    )
	    best_index = int(np.argmax(similarities))
	    max_similarity = similarities[best_index]
	    best_matched_sentence = page_sentences[best_index]

	    # Convert to a built-in bool: the comparison yields a NumPy bool, and
	    # "numpy_bool is False" is never true, so identity checks on the flag
	    # silently took the wrong branch.
	    is_paraphrase_sentence = bool(max_similarity > PARAPHRASE_THRESHOLD)

	    alignment = {
	        "input_sentence": input_text,
	        "matched_sentence": (
	            best_matched_sentence if is_paraphrase_sentence else ""
	        ),
	        "similarity": max_similarity,
	        "label": "",
	        "paraphrase": is_paraphrase_sentence,
	        "url": url if is_paraphrase_sentence else "",
	    }

	    # NOTE(review): an earlier check_sentence() test at this point was
	    # always overwritten by the similarity result and never influenced the
	    # outcome, so it is no longer evaluated.
	    is_paraphrase_text = is_paraphrase_sentence

	    # Method 2: the overlap between the input and its best match covers
	    # more than half of the input's words.
	    # NOTE(review): when only this rule fires, the alignment still has an
	    # empty "matched_sentence" and "url" - confirm that is acceptable.
	    equal_idx_1, _ = extract_equal_text(input_text, best_matched_sentence)
	    matched_count = sum(span["end"] - span["start"] for span in equal_idx_1)
	    words_only = input_text.translate(
	        str.maketrans("", "", string.punctuation),
	    )
	    if matched_count > len(words_only.split()) / 2:
	        is_paraphrase_text = True

	    return is_paraphrase_text, alignment


	def similarity_ratio(a, b):
	    """
	    Compute how similar two strings are with difflib's SequenceMatcher.

	    Args:
	        a: The first string.
	        b: The second string.

	    Returns:
	        The SequenceMatcher ratio, a float between 0.0 and 1.0.
	        0.0 if either argument is not a string (None included).
	    """
	    # isinstance() already rejects None, so no separate None test is needed.
	    if isinstance(a, str) and isinstance(b, str):
	        return SequenceMatcher(None, a, b).ratio()
	    return 0.0


	def check_human(alligned_sentences):
	    """
	    Tell whether an alignment matches its source almost exactly.

	    Args:
	        alligned_sentences: Mapping with a "similarity" entry, such as the
	            alignment produced by check_paraphrase.

	    Returns:
	        bool: True if "similarity" is at least 0.99. False otherwise,
	        including when the argument is empty or "similarity" is missing
	        or None.
	    """
	    if not alligned_sentences:  # Handle empty data case
	        return False

	    # Records that never went through a comparison carry similarity=None;
	    # comparing None with a float would raise a TypeError.
	    similarity = alligned_sentences.get("similarity")
	    if similarity is None:
	        return False

	    return bool(similarity >= 0.99)


	# No command-line entry point: the guard body is a no-op.
	if __name__ == "__main__":
	pass