from collections import Counter
import os
import string

import requests
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from src.application.text.identity import extract_entities

load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")


def search_by_google(
    query,
    num_results=10,
    is_exact_terms=False,
) -> dict:
    """
    Searches the Google Custom Search Engine for the given query.

    Args:
        query: The search query.
        num_results: The number of results to return (default: 10).
        is_exact_terms: Whether to use exact-terms search (True) or regular search (False).

    Returns:
        A dictionary containing the search results, or None if there was an error.
    """
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "num": num_results,
    }
    if is_exact_terms:
        params["exactTerms"] = query
    else:
        params["q"] = query.replace('"', "")

    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}, {response.text}")
        return None
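
# Example usage (a sketch; assumes valid GOOGLE_API_KEY and SEARCH_ENGINE_ID
# values in the .env file and network access to the Custom Search API):
#
#     results = search_by_google("climate change report 2024", num_results=5)
#     if results:
#         for item in results.get("items", []):
#             print(item.get("link"))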


def get_most_frequent_words(input_text, number_word=32):
    """
    Gets the most frequent words from the input text, excluding stop words and
    punctuation, and joins them into a single search phrase.

    Args:
        input_text: The input text as a string.
        number_word: The maximum number of top words to include.

    Returns:
        A string of the most frequent words separated by spaces.
        Returns an empty list if input is not a string or is empty.
    """
    if not isinstance(input_text, str) or not input_text:
        return []

    words = word_tokenize(input_text.lower())  # Tokenize and lowercase
    stop_words = set(stopwords.words("english"))
    punctuation = set(string.punctuation)  # Get all punctuation

    filtered_words = [
        word for word in words
        if word.isalnum() and word not in stop_words and word not in punctuation
    ]

    word_frequencies = Counter(filtered_words)
    top_words = [word for word, _ in word_frequencies.most_common(number_word)]

    # Join the top words into a single search phrase, capped at 32 words.
    search_phrase = " ".join(top_words[:32])

    return search_phrase
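
# Example (assumes the NLTK "punkt" and "stopwords" data have been downloaded):
#
#     get_most_frequent_words("The cat sat on the mat. The cat slept.")
#     # -> a space-separated phrase such as "cat sat mat slept",
#     #    ordered by descending frequency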


def get_chunk(input_text, chunk_length=32, num_chunk=3):
    """
    Splits the input text into chunks of a specified length.

    Args:
        input_text: The input text as a string.
        chunk_length: The desired length of each chunk (in words).
        num_chunk: The maximum number of chunks to create.

    Returns:
        A list of string chunks.
        Returns an empty list if input is invalid.
    """
    if not isinstance(input_text, str):
        return []

    chunks = []
    input_words = input_text.split()  # Split by any whitespace

    for i in range(num_chunk):
        start_index = i * chunk_length
        end_index = (i + 1) * chunk_length
        chunk = " ".join(input_words[start_index:end_index])
        if chunk:  # Only append non-empty chunks
            chunks.append(chunk)

    return chunks
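
# Example:
#
#     get_chunk("one two three four five six", chunk_length=2, num_chunk=2)
#     # -> ["one two", "three four"]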


def get_keywords(text, num_keywords=5):
    """Return the top keywords from a document using the TF-IDF method."""
    # Create a TF-IDF vectorizer that drops English stop words
    vectorizer = TfidfVectorizer(stop_words="english")

    # Fit and transform the text
    tfidf_matrix = vectorizer.fit_transform([text])

    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Get TF-IDF scores
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Sort words by TF-IDF score in descending order
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)

    # Return the top keywords
    return [word for word, score in word_scores[:num_keywords]]
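
# Example (with a single document, TF-IDF scores reduce to normalized term
# frequency, so the most repeated non-stop-word ranks first):
#
#     get_keywords("apple apple banana cherry", num_keywords=2)
#     # -> ["apple", "banana"] (ties keep the vectorizer's alphabetical order)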


def generate_search_phrases(input_text):
    """
    Generates different types of phrases for search purposes.

    Args:
        input_text: The input text.

    Returns:
        A list of search phrases containing:
        - A phrase built from the most frequent words.
        - The original input text.
        - Chunks of the text.
        - A phrase built from named entities and TF-IDF keywords.
        Returns an empty list if input is invalid.
    """
    if not isinstance(input_text, str):
        return []

    search_phrases = []

    # Method 1: Get the most frequent words
    search_phrases.append(get_most_frequent_words(input_text))

    # Method 2: Get the whole text
    search_phrases.append(input_text)

    # Method 3: Split the text into chunks
    search_phrases.extend(get_chunk(input_text))

    # Method 4: Combine named entities and keywords
    entities = extract_entities(input_text)
    keywords = get_keywords(input_text, 16)
    search_phrase = " ".join(entities) + " " + " ".join(keywords)
    search_phrases.append(search_phrase)

    return search_phrases
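

# Minimal manual check (a sketch; assumes the NLTK corpora and the local
# src.application.text.identity module, plus whatever model extract_entities
# relies on, are available in this environment):
if __name__ == "__main__":
    sample_text = (
        "NASA announced a new mission to study the Moon's south pole, where "
        "scientists hope to find water ice inside permanently shadowed craters."
    )
    for phrase in generate_search_phrases(sample_text):
        print(phrase)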