import os
import string
from collections import Counter

import requests
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from src.application.text.identity import extract_entities

load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")


def search_by_google(
    query,
    num_results=10,
    is_exact_terms=False,
) -> dict:
    """
    Searches the Google Custom Search Engine for the given query.

    Args:
        query: The search query.
        num_results: The number of results to return (default: 10).
        is_exact_terms: Whether to use exact-terms search (True) or regular search (False).

    Returns:
        A dictionary containing the search results, or None if there was an error.
    """
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "num": num_results,
    }
    if is_exact_terms:
        params["exactTerms"] = query
    else:
        params["q"] = query.replace('"', "")

    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}, {response.text}")
        return None
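
# Illustrative usage sketch for search_by_google (not executed on import): it
# assumes GOOGLE_API_KEY and SEARCH_ENGINE_ID are set in the environment or a
# .env file; the query string is a made-up example.
#
#     results = search_by_google("climate change report", num_results=5)
#     if results:
#         for item in results.get("items", []):
#             print(item.get("link"))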


def get_most_frequent_words(input_text, number_word=32):
    """
    Gets the most frequent words from the input text, excluding stop words and
    punctuation, and joins them into a single search phrase.

    Args:
        input_text: The input text as a string.
        number_word: The number of top words to include.

    Returns:
        A search phrase built from the most frequent words (capped at 32 words).
        Returns an empty list if the input is not a string or is empty.
    """
    if not isinstance(input_text, str) or not input_text:
        return []

    words = word_tokenize(input_text.lower())  # Tokenize and lowercase
    stop_words = set(stopwords.words("english"))
    punctuation = set(string.punctuation)  # All punctuation characters

    filtered_words = [
        word for word in words
        if word.isalnum() and word not in stop_words and word not in punctuation
    ]
    word_frequencies = Counter(filtered_words)
    top_words = [word for word, _ in word_frequencies.most_common(number_word)]

    # Join the most frequent words into a single search phrase, capped at
    # 32 words to keep the query at a reasonable length.
    search_phrase = " ".join(top_words[:32])

    return search_phrase
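
# Illustrative check of get_most_frequent_words (hypothetical input): repeated
# content words such as "budget" should lead the returned phrase, while stop
# words like "the" and "was" are filtered out. The order of equally frequent
# words may vary.
#
#     phrase = get_most_frequent_words(
#         "The city budget was approved. The budget increases school funding.",
#         number_word=5,
#     )
#     # e.g. "budget city approved increases school"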


def get_chunk(input_text, chunk_length=32, num_chunk=3):
    """
    Splits the input text into chunks of a specified length.

    Args:
        input_text: The input text as a string.
        chunk_length: The desired length of each chunk (in words).
        num_chunk: The maximum number of chunks to create.

    Returns:
        A list of string chunks.
        Returns an empty list if the input is invalid.
    """
    if not isinstance(input_text, str):
        return []

    chunks = []
    input_words = input_text.split()  # Split by any whitespace
    for i in range(num_chunk):
        start_index = i * chunk_length
        end_index = (i + 1) * chunk_length
        chunk = " ".join(input_words[start_index:end_index])
        if chunk:  # Only append non-empty chunks
            chunks.append(chunk)
    return chunks
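
# Illustrative check of get_chunk (hypothetical input): with chunk_length=3 and
# num_chunk=2, a nine-word sentence yields the first two three-word chunks.
#
#     get_chunk("one two three four five six seven eight nine",
#               chunk_length=3, num_chunk=2)
#     # -> ["one two three", "four five six"]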


def get_keywords(text, num_keywords=5):
    """Return the top keywords from a document using the TF-IDF method."""
    # Create a TF-IDF vectorizer that drops English stop words
    vectorizer = TfidfVectorizer(stop_words="english")

    # Fit and transform the text
    tfidf_matrix = vectorizer.fit_transform([text])

    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Get TF-IDF scores
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Sort words by TF-IDF score in descending order
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)

    # Return the top keywords
    return [word for word, score in word_scores[:num_keywords]]
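
# Note: get_keywords fits the vectorizer on a single document, so the IDF term
# is the same for every word and the ranking effectively reflects raw term
# frequency. Illustrative sketch (hypothetical input):
#
#     get_keywords("solar panels cut energy costs; solar adoption is growing", 3)
#     # -> ["solar", ...]  ("solar" ranks first since it appears twice)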


def generate_search_phrases(input_text):
    """
    Generates different types of phrases for search purposes.

    Args:
        input_text: The input text.

    Returns:
        A list of search phrases containing:
        - A phrase built from the most frequent words.
        - The original input text.
        - Chunks of the input text.
        - A phrase built from named entities and TF-IDF keywords.
        Returns an empty list if the input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    search_phrases = []

    # Method 1: Most frequent words
    search_phrases.append(get_most_frequent_words(input_text))

    # Method 2: The whole text
    search_phrases.append(input_text)

    # Method 3: Split the text into chunks
    search_phrases.extend(get_chunk(input_text))

    # Method 4: Named entities and keywords
    entities = extract_entities(input_text)
    keywords = get_keywords(input_text, 16)
    search_phrase = " ".join(entities) + " " + " ".join(keywords)
    search_phrases.append(search_phrase)

    return search_phrases
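

# Minimal end-to-end sketch, assuming the required NLTK data ("punkt" and
# "stopwords") has already been downloaded and src.application.text.identity
# is importable; the sample sentence is purely illustrative.
if __name__ == "__main__":
    sample_text = "NASA announced a new lunar mission scheduled for next year."
    for phrase in generate_search_phrases(sample_text):
        print(phrase)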