"""
Author: Khanh Phan
Date: 2024-12-04
"""
import string
from collections import Counter
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from src.application.config import (
CHUNK_SIZE,
GOOGLE_API_KEY,
GOOGLE_ENDPOINT_URL,
NUM_CHUNKS,
NUM_FREQUENT_WORDS,
NUM_KEYWORDS,
SEARCH_ENGINE_ID,
STOPWORDS_LANG,
TOP_SEARCH_RESUTLS,
)
from src.application.text.entity import extract_entities
def search_by_google(
query,
num_results=TOP_SEARCH_RESUTLS,
is_exact_terms=False,
) -> dict:
"""
Performs a Google Custom Search API query.
Args:
query (str): The search query string.
num_results (int, optional): The number of search results to return.
Defaults to TOP_SEARCH_RESUTLS.
        is_exact_terms (bool, optional): Whether to perform an exact-phrase search.
            Defaults to False.
    Returns:
        dict: The JSON response from the Google Custom Search API,
            or None if an error occurs.
"""
params = {
"key": GOOGLE_API_KEY,
"cx": SEARCH_ENGINE_ID,
"num": num_results,
}
if is_exact_terms:
params["exactTerms"] = query
else:
params["q"] = query.replace('"', "")
response = requests.get(GOOGLE_ENDPOINT_URL, params=params)
if response.status_code == 200:
return response.json()
else:
print(f"Error: {response.status_code}, {response.text}")
return None
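
# Example usage (a minimal sketch; assumes GOOGLE_API_KEY, SEARCH_ENGINE_ID,
# and GOOGLE_ENDPOINT_URL point at a working Custom Search configuration):
#     results = search_by_google("sea level rise 2024", num_results=5)
#     if results is not None:
#         urls = [item["link"] for item in results.get("items", [])]
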
def get_most_frequent_words(
input_text: str,
number_word: int = NUM_FREQUENT_WORDS,
) -> str:
"""
Extracts the most frequent words from the input text
and forms a search phrase.
Args:
input_text (str): The text from which to extract frequent words.
        number_word (int, optional): The number of frequent words to extract.
            Defaults to NUM_FREQUENT_WORDS.
    Returns:
        str: A search phrase consisting of the most frequent words,
            or None if the input text is invalid.
"""
# Check if the input text is valid
if not isinstance(input_text, str) or not input_text:
return None
# Tokenize the input text into words and convert to lowercase
words = word_tokenize(input_text.lower())
# Get the set of stop words for the specified language
stop_words = set(stopwords.words(STOPWORDS_LANG))
# Get the set of punctuation characters
punctuation = set(string.punctuation)
# Filter out stop words, punctuation, and non-alphanumeric words
filtered_words = [
word
for word in words
if word.isalnum()
and word not in stop_words
and word not in punctuation
]
# Count the frequency of each filtered word
word_frequencies = Counter(filtered_words)
# Get the most common words and their frequencies
top_words = word_frequencies.most_common(number_word)
    # Construct the search phrase from the most frequent words
    frequent_words = [word for word, _ in top_words]
    search_phrase = " ".join(frequent_words)
    return search_phrase
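
# Example usage (a minimal sketch; assumes the NLTK "punkt" tokenizer and
# "stopwords" corpus have been downloaded):
#     get_most_frequent_words("The cat sat on the mat. The cat slept.", 3)
#     # e.g. "cat sat mat" (ordered by descending frequency)
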
def get_chunk(
input_text: str,
chunk_size: int = CHUNK_SIZE,
num_chunk: int = NUM_CHUNKS,
) -> list[str]:
"""
Splits the input text into chunks of a specified size.
Args:
input_text (str): The text to be chunked.
chunk_size (int, optional): The number of words per chunk.
num_chunk (int, optional): The number of chunks to generate.
Returns:
list: A list of chunks of the input text.
"""
if not isinstance(input_text, str):
return []
chunks = []
input_words = input_text.split() # Split by any whitespace
for i in range(num_chunk):
# Calculate the start and end indices for the current chunk
start_index = i * chunk_size
end_index = (i + 1) * chunk_size
# Extract the words for the current chunk and join them into a string
chunk = " ".join(input_words[start_index:end_index])
if chunk: # Only append non-empty chunks
chunks.append(chunk)
return chunks
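
# Example usage (a minimal sketch; the defaults CHUNK_SIZE and NUM_CHUNKS
# come from the application config):
#     get_chunk("one two three four five six", chunk_size=2, num_chunk=2)
#     # ["one two", "three four"]
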
def get_keywords(text: str, num_keywords: int = NUM_KEYWORDS) -> list[str]:
"""
Extracts the top keywords from a given text using the TF-IDF method.
Args:
text (str): The input text from which to extract keywords.
num_keywords (int, optional): The number of top keywords to return.
Returns:
list: A list of strings representing the top keywords extracted
from the text.
"""
# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words=STOPWORDS_LANG)
# Fit and transform the text
tfidf_matrix = vectorizer.fit_transform([text])
# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()
# Get TF-IDF scores
tfidf_scores = tfidf_matrix.toarray()[0]
# Sort words by TF-IDF score
word_scores = list(zip(feature_names, tfidf_scores))
word_scores.sort(key=lambda x: x[1], reverse=True)
# Return top keywords
return [word for word, score in word_scores[:num_keywords]]
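
# Example usage (a minimal sketch; with a single document, TF-IDF scores reduce
# to normalized term frequencies, and ties keep the vectorizer's alphabetical order):
#     get_keywords("solar power and solar panels generate electricity", 2)
#     # "solar" ranks first (it appears twice); the second slot is the first tied term
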
def generate_search_phrases(input_text: str) -> list[str]:
"""
Generates different types of phrases for search purposes.
Args:
input_text: The input text.
    Returns:
        list[str]: A list of search phrases, containing:
            - A phrase built from the most frequent words.
            - The original input text.
            - The text chunks.
            - The text with entities removed.
"""
if not isinstance(input_text, str):
return []
search_phrases = []
# Method 1: Get most frequent words
search_phrases.append(get_most_frequent_words(input_text))
# Method 2: Get the whole text
search_phrases.append(input_text)
# Method 3: Split text by chunks
search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
    # Method 4: Remove entities from the text
entities = extract_entities(input_text)
text_without_entities = remove_identities_from_text(input_text, entities)
search_phrases.append(text_without_entities)
# keywords = get_keywords(input_text, 16)
# search_phrase = " ".join(entities) + " " + " ".join(keywords)
# search_phrases.append(search_phrase) # TODO: for demo purposes
return search_phrases
def remove_identities_from_text(input_text: str, entities: list[str]) -> str:
"""
Removes entities from the input text.
Args:
input_text: The input text as a string.
        entities: A list of entities to be removed.
    Returns:
        str: The input text with all entities removed.
    """
for entity in entities:
input_text = input_text.replace(entity, "")
return input_text
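
# Minimal sketch of exercising the module end to end (assumes the NLTK data and
# the NER model behind extract_entities are available; the resulting phrases
# could then be passed to search_by_google):
if __name__ == "__main__":
    sample_text = "NASA and ESA plan joint lunar missions starting in 2026."
    for phrase in generate_search_phrases(sample_text):
        print(phrase)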