from collections import Counter
import os
import string
from typing import Optional
import requests
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from src.application.text.identity import extract_entities

load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")

def search_by_google(
    query,
    num_results=10,
    is_exact_terms=False,
) -> Optional[dict]:
    """
    Searches the Google Custom Search Engine for the given query.

    Args:
        query: The search query.
        num_results: The number of results to return (default: 10).
        is_exact_terms: Whether to use exact-terms search (True) or regular search (False).

    Returns:
        A dictionary containing the search results, or None if there was an error.
    """
    
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "num": num_results,
    }
    if is_exact_terms:
        params["exactTerms"] = query
    else:
        params["q"] = query.replace('"', "")
    
    response = requests.get(url, params=params, timeout=10)  # Timeout guards against a hung request
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}, {response.text}")
        return None
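
# A minimal usage sketch (kept as a comment so it does not run on import).
# The query is illustrative; real calls need GOOGLE_API_KEY and
# SEARCH_ENGINE_ID set in .env.
#
#     results = search_by_google("artemis program moon landing", num_results=5)
#     if results:
#         for item in results.get("items", []):
#             print(item.get("title"), item.get("link"))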

def get_most_frequent_words(input_text, number_word=32):
    """
    Builds a search phrase from the most frequent words in the input text,
    excluding stop words and punctuation.

    Args:
        input_text: The input text as a string.
        number_word: The maximum number of top words to include.

    Returns:
        A search phrase string of the top words joined by spaces, capped at
        32 words. Returns an empty string if input is not a string or is empty.
    """
    if not isinstance(input_text, str) or not input_text:
        return ""

    words = word_tokenize(input_text.lower())  # Tokenize and lowercase

    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)  # All punctuation characters
    filtered_words = [
        word for word in words
        if word.isalnum() and word not in stop_words and word not in punctuation
    ]
    word_frequencies = Counter(filtered_words)
    top_words = [word for word, _ in word_frequencies.most_common(number_word)]

    # Google ignores query terms beyond 32 words, so cap the phrase there.
    search_phrase = " ".join(top_words[:32])

    return search_phrase
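
# Example sketch: for "The cat sat on the mat. The cat slept.", stop words
# such as "the" and "on" are filtered out, so the phrase is built from the
# remaining tokens ranked by frequency, e.g. "cat sat mat slept" ("cat"
# first with two occurrences; ties keep first-seen order).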

def get_chunk(input_text, chunk_length=32, num_chunk=3):
    """
    Splits the input text into chunks of a specified length.

    Args:
        input_text: The input text as a string.
        chunk_length: The desired length of each chunk (in words).
        num_chunk: The maximum number of chunks to create.

    Returns:
        A list of string chunks. 
        Returns an empty list if input is invalid.
    """
    if not isinstance(input_text, str):
        return []

    chunks = []
    input_words = input_text.split()  # Split by any whitespace

    for i in range(num_chunk):
        start_index = i * chunk_length
        end_index = (i + 1) * chunk_length
        chunk = " ".join(input_words[start_index:end_index])
        if chunk:  # Only append non-empty chunks
            chunks.append(chunk)

    return chunks
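
# Example sketch: get_chunk("one two three four five six", chunk_length=2,
# num_chunk=2) yields ["one two", "three four"]. A third chunk ("five six")
# is skipped because num_chunk caps the count, and slices past the end of
# the text produce empty chunks that are not appended.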

def get_keywords(text, num_keywords=5):
    """Return the top num_keywords keywords from a document, ranked by TF-IDF score."""

    # Create a TF-IDF vectorizer. Note: with a single document the IDF term
    # is constant, so the ranking effectively reduces to term frequency.
    vectorizer = TfidfVectorizer(stop_words='english')
    
    # Fit and transform the text
    tfidf_matrix = vectorizer.fit_transform([text])
    
    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()
    
    # Get TF-IDF scores
    tfidf_scores = tfidf_matrix.toarray()[0]
    
    # Sort words by TF-IDF score
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)
    
    # Return top keywords
    return [word for word, score in word_scores[:num_keywords]]
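
# Example sketch: with a single document the scores track term frequency, so
# get_keywords("apple apple banana cherry", num_keywords=2) should return
# ["apple", "banana"]: "apple" scores highest, and the tie between "banana"
# and "cherry" resolves to the vectorizer's alphabetical feature order
# because Python's sort is stable.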


def generate_search_phrases(input_text):
    """
    Generates different types of phrases for search purposes.

    Args:
        input_text: The input text.

    Returns:
        A list of search-phrase strings:
        - The most frequent words joined into a single phrase.
        - The original input text.
        - Up to three fixed-length text chunks.
        - Named entities combined with TF-IDF keywords.
    """
    if not isinstance(input_text, str):
        return []
    
    search_phrases = []
    
    # Method 1: Get most frequent words
    search_phrases.append(get_most_frequent_words(input_text))
    
    # Method 2: Get the whole text
    search_phrases.append(input_text)
    
    # Method 3: Split text by chunks
    search_phrases.extend(get_chunk(input_text))
    
    # Method 4: Combine named entities and TF-IDF keywords
    entities = extract_entities(input_text)
    keywords = get_keywords(input_text, 16)
    search_phrase = " ".join(list(entities) + list(keywords))
    search_phrases.append(search_phrase)
    
    return search_phrases
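

if __name__ == "__main__":
    # Minimal smoke test, illustrative only: print the generated search
    # phrases for a short sample text. Assumes the NLTK 'punkt' and
    # 'stopwords' data have been downloaded and that
    # src.application.text.identity is importable.
    sample = (
        "NASA's Artemis program aims to return astronauts to the Moon "
        "and establish a sustainable presence by the end of the decade."
    )
    for phrase in generate_search_phrases(sample):
        print("-", phrase)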