"""
Author: Khanh Phan
Date: 2024-12-04
"""
import string
from collections import Counter
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from src.application.config import (
CHUNK_SIZE,
GOOGLE_API_KEY,
GOOGLE_ENDPOINT_URL,
NUM_CHUNKS,
NUM_FREQUENT_WORDS,
NUM_KEYWORDS,
SEARCH_ENGINE_ID,
STOPWORDS_LANG,
TOP_SEARCH_RESUTLS,
)
from src.application.text.entity import extract_entities
def search_by_google(
query,
num_results=TOP_SEARCH_RESUTLS,
is_exact_terms=False,
) -> dict:
"""
Performs a Google Custom Search API query.
Args:
query (str): The search query string.
num_results (int, optional): The number of search results to return.
Defaults to TOP_SEARCH_RESUTLS.
        is_exact_terms (bool, optional): Whether to perform an exact-phrase search.
            Defaults to False.
    Returns:
        dict: The JSON response from the Google Custom Search API,
            or None if an error occurs.
"""
params = {
"key": GOOGLE_API_KEY,
"cx": SEARCH_ENGINE_ID,
"num": num_results,
}
if is_exact_terms:
params["exactTerms"] = query
else:
params["q"] = query.replace('"', "")
response = requests.get(GOOGLE_ENDPOINT_URL, params=params)
if response.status_code == 200:
return response.json()
else:
print(f"Error: {response.status_code}, {response.text}")
return None
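
# Example usage (a minimal sketch; assumes GOOGLE_API_KEY, SEARCH_ENGINE_ID,
# and GOOGLE_ENDPOINT_URL point at a working Custom Search configuration):
#     results = search_by_google("sea level rise 2024", num_results=5)
#     if results is not None:
#         urls = [item["link"] for item in results.get("items", [])]
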
def get_most_frequent_words(
input_text: str,
number_word: int = NUM_FREQUENT_WORDS,
) -> str:
"""
Extracts the most frequent words from the input text
and forms a search phrase.
Args:
input_text (str): The text from which to extract frequent words.
        number_word (int, optional): The number of frequent words to extract.
            Defaults to NUM_FREQUENT_WORDS.
    Returns:
        str: A search phrase consisting of the most frequent words,
            or None if the input text is invalid.
"""
# Check if the input text is valid
if not isinstance(input_text, str) or not input_text:
return None
# Tokenize the input text into words and convert to lowercase
words = word_tokenize(input_text.lower())
# Get the set of stop words for the specified language
stop_words = set(stopwords.words(STOPWORDS_LANG))
# Get the set of punctuation characters
punctuation = set(string.punctuation)
# Filter out stop words, punctuation, and non-alphanumeric words
filtered_words = [
word
for word in words
if word.isalnum()
and word not in stop_words
and word not in punctuation
]
# Count the frequency of each filtered word
word_frequencies = Counter(filtered_words)
# Get the most common words and their frequencies
top_words = word_frequencies.most_common(number_word)
    # Construct the search phrase from the most frequent words
    frequent_words = [word for word, _ in top_words]
    search_phrase = " ".join(frequent_words)
    return search_phrase
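
# Example usage (a minimal sketch; assumes the NLTK "punkt" tokenizer and
# "stopwords" corpus have been downloaded):
#     get_most_frequent_words("The cat sat on the mat. The cat slept.", 3)
#     # e.g. "cat sat mat" (ordered by descending frequency)
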
def get_chunk(
input_text: str,
chunk_size: int = CHUNK_SIZE,
num_chunk: int = NUM_CHUNKS,
) -> list[str]:
"""
Splits the input text into chunks of a specified size.
Args:
input_text (str): The text to be chunked.
chunk_size (int, optional): The number of words per chunk.
num_chunk (int, optional): The number of chunks to generate.
Returns:
list: A list of chunks of the input text.
"""
if not isinstance(input_text, str):
return []
chunks = []
input_words = input_text.split() # Split by any whitespace
for i in range(num_chunk):
# Calculate the start and end indices for the current chunk
start_index = i * chunk_size
end_index = (i + 1) * chunk_size
# Extract the words for the current chunk and join them into a string
chunk = " ".join(input_words[start_index:end_index])
if chunk: # Only append non-empty chunks
chunks.append(chunk)
return chunks
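
# Example usage (a minimal sketch; the defaults CHUNK_SIZE and NUM_CHUNKS
# come from the application config):
#     get_chunk("one two three four five six", chunk_size=2, num_chunk=2)
#     # ["one two", "three four"]
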
def get_keywords(text: str, num_keywords: int = NUM_KEYWORDS) -> list[str]:
"""
Extracts the top keywords from a given text using the TF-IDF method.
Args:
text (str): The input text from which to extract keywords.
num_keywords (int, optional): The number of top keywords to return.
Returns:
list: A list of strings representing the top keywords extracted
from the text.
"""
# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words=STOPWORDS_LANG)
# Fit and transform the text
tfidf_matrix = vectorizer.fit_transform([text])
# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()
# Get TF-IDF scores
tfidf_scores = tfidf_matrix.toarray()[0]
# Sort words by TF-IDF score
word_scores = list(zip(feature_names, tfidf_scores))
word_scores.sort(key=lambda x: x[1], reverse=True)
# Return top keywords
return [word for word, score in word_scores[:num_keywords]]
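
# Example usage (a minimal sketch; with a single document, TF-IDF scores reduce
# to normalized term frequencies, and ties keep the vectorizer's alphabetical order):
#     get_keywords("solar power and solar panels generate electricity", 2)
#     # "solar" ranks first (it appears twice); the second slot is the first tied term
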
def generate_search_phrases(input_text: str) -> list[str]:
"""
Generates different types of phrases for search purposes.
Args:
input_text: The input text.
    Returns:
        list[str]: A list of search phrases, containing:
            - A phrase built from the most frequent words.
            - The original input text.
            - The text chunks.
            - The text with entities removed.
"""
if not isinstance(input_text, str):
return []
search_phrases = []
# Method 1: Get most frequent words
search_phrases.append(get_most_frequent_words(input_text))
# Method 2: Get the whole text
search_phrases.append(input_text)
# Method 3: Split text by chunks
search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
    # Method 4: Remove entities from the text
entities = extract_entities(input_text)
text_without_entities = remove_identities_from_text(input_text, entities)
search_phrases.append(text_without_entities)
# keywords = get_keywords(input_text, 16)
# search_phrase = " ".join(entities) + " " + " ".join(keywords)
# search_phrases.append(search_phrase) # TODO: for demo purposes
return search_phrases
def remove_identities_from_text(input_text: str, entities: list[str]) -> str:
"""
Removes entities from the input text.
Args:
input_text: The input text as a string.
        entities: A list of entities to be removed.
    Returns:
        str: The input text with all entities removed.
    """
for entity in entities:
input_text = input_text.replace(entity, "")
return input_text
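
# Minimal sketch of exercising the module end to end (assumes the NLTK data and
# the NER model behind extract_entities are available; the resulting phrases
# could then be passed to search_by_google):
if __name__ == "__main__":
    sample_text = "NASA and ESA plan joint lunar missions starting in 2026."
    for phrase in generate_search_phrases(sample_text):
        print(phrase)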