Spaces:

pmkhanh7890
/

news_verification

Sleeping

File size: 6,725 Bytes

"""
Author: Khanh Phan
Date: 2024-12-04
"""

import string
from collections import Counter

import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from src.application.config import (
    CHUNK_SIZE,
    GOOGLE_API_KEY,
    GOOGLE_ENDPOINT_URL,
    NUM_CHUNKS,
    NUM_FREQUENT_WORDS,
    NUM_KEYWORDS,
    SEARCH_ENGINE_ID,
    STOPWORDS_LANG,
    TOP_SEARCH_RESUTLS,
)
from src.application.text.entity import extract_entities


def search_by_google(
    query,
    num_results=TOP_SEARCH_RESUTLS,
    is_exact_terms=False,
    timeout=10,
) -> dict:
    """
    Performs a Google Custom Search API query.

    Args:
        query (str): The search query string.
        num_results (int, optional): The number of search results to return.
            Defaults to TOP_SEARCH_RESUTLS.
        is_exact_terms (bool, optional): use an exact phrase search or not.
            Defaults to False.
        timeout (float, optional): Seconds to wait for the API before
            giving up. Defaults to 10.

    Returns:
        dict: JSON response from the Google Custom Search API,
            None if an error occurs (network failure, timeout,
            non-200 status code or a body that is not valid JSON).
    """

    params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "num": num_results,
    }
    if is_exact_terms:
        params["exactTerms"] = query
    else:
        # Double quotes are stripped from the free-text query.
        params["q"] = query.replace('"', "")

    try:
        # Without an explicit timeout, requests waits indefinitely.
        response = requests.get(
            GOOGLE_ENDPOINT_URL,
            params=params,
            timeout=timeout,
        )
    except requests.RequestException as error:
        # Only the exception type is printed: the message of a requests
        # exception can embed the full URL, which carries the API key.
        print(f"Error: search request failed ({type(error).__name__})")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None

    try:
        return response.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated as an error.
        print("Error: search response is not valid JSON")
        return None


def get_most_frequent_words(
    input_text: str,
    number_word: int = NUM_FREQUENT_WORDS,
) -> str:
    """
    Extracts the most frequent words from the input text
        and forms a search phrase.

    Args:
        input_text (str): The text from which to extract frequent words.
        number_word (int, optional): The number of frequent words to extract.
            The phrase never holds more than NUM_FREQUENT_WORDS words,
            even when a larger number is requested.

    Returns:
        str: A search phrase consisting of the most frequent words,
            ordered from most to least frequent and separated by spaces.
            None if input_text is not a non-empty string.
    """
    # Check if the input text is valid
    if not isinstance(input_text, str) or not input_text:
        return None

    # Tokenize the input text into words and convert to lowercase
    tokens = word_tokenize(input_text.lower())

    # Get the set of stop words for the specified language
    stop_words = set(stopwords.words(STOPWORDS_LANG))

    # Keep alphanumeric, non-stop-word tokens. isalnum() already rejects
    # punctuation tokens, so no separate punctuation filter is needed.
    filtered_words = [
        token
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]

    # Count the frequency of each filtered word
    word_frequencies = Counter(filtered_words)

    # Build the phrase from the ranked words themselves, not from the raw
    # token list, so stop words and punctuation cannot leak into it.
    limit = min(number_word, NUM_FREQUENT_WORDS)
    top_words = [word for word, _ in word_frequencies.most_common(limit)]

    return " ".join(top_words)


def get_chunk(
    input_text: str,
    chunk_size: int = CHUNK_SIZE,
    num_chunk: int = NUM_CHUNKS,
) -> list[str]:
    """
    Cuts the input text into consecutive, fixed-size groups of words.

    Args:
        input_text (str): The text to be chunked.
        chunk_size (int, optional): How many words go into each chunk.
        num_chunk (int, optional): The maximum number of chunks to build.

    Returns:
        list: The non-empty chunks, each one a space-joined string of
            words. An empty list if input_text is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # str.split() without arguments splits on any run of whitespace.
    tokens = input_text.split()

    # Window k covers words [k * chunk_size, (k + 1) * chunk_size).
    windows = (
        " ".join(tokens[k * chunk_size:(k + 1) * chunk_size])
        for k in range(num_chunk)
    )

    # Windows that start past the end of the text are empty; skip them.
    return [window for window in windows if window]


def get_keywords(text: str, num_keywords: int = NUM_KEYWORDS) -> list[str]:
    """
    Ranks the words of a text by TF-IDF score and returns the best ones.

    Args:
        text (str): The input text from which to extract keywords.
        num_keywords (int, optional): How many top-ranked keywords to keep.

    Returns:
        list: The keywords with the highest TF-IDF scores, best first.
    """
    tfidf = TfidfVectorizer(stop_words=STOPWORDS_LANG)

    # The text is fitted as a one-document corpus, so row 0 of the
    # resulting matrix holds the score of every vocabulary term.
    scores = tfidf.fit_transform([text]).toarray()[0]
    terms = tfidf.get_feature_names_out()

    # Highest score first; sorted() is stable, so ties keep the
    # vocabulary order.
    ranked = sorted(
        zip(terms, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return [term for term, _ in ranked[:num_keywords]]


def generate_search_phrases(input_text: str) -> list[str]:
    """
    Generates different types of phrases for search purposes.

    Args:
        input_text: The input text.

    Returns:
        A flat list of strings, in this order:
        - The phrase built from the most frequent words.
        - The original input text.
        - The text chunks (zero or more entries).
        - The text with its entities removed.
        An empty list if input_text is not a non-empty string.
    """
    # For an empty string get_most_frequent_words() returns None and the
    # remaining phrases would be blank, so there is nothing to search for.
    if not isinstance(input_text, str) or not input_text:
        return []

    search_phrases = []

    # Method 1: Get most frequent words
    search_phrases.append(get_most_frequent_words(input_text))

    # Method 2: Get the whole text
    search_phrases.append(input_text)

    # Method 3: Split text by chunks
    search_phrases.extend(get_chunk(input_text))  # TODO: for demo purposes

    # Method 4: Remove identities and key words
    entities = extract_entities(input_text)
    text_without_entities = remove_identities_from_text(input_text, entities)
    search_phrases.append(text_without_entities)
    # keywords = get_keywords(input_text, 16)
    # search_phrase = " ".join(entities) + " " + " ".join(keywords)
    # search_phrases.append(search_phrase) # TODO: for demo purposes

    return search_phrases


def remove_identities_from_text(input_text: str, entities: list[str]) -> str:
    """
    Deletes every occurrence of each given entity from a text.

    Args:
        input_text: The text to clean.
        entities: The entity strings to delete, applied in list order.

    Returns:
        The text with all occurrences of the entities removed. Whitespace
        around a removed entity is left in place.
    """
    cleaned_text = input_text
    for entity_name in entities:
        # Plain substring replacement: all matches of the entity go at once.
        cleaned_text = cleaned_text.replace(entity_name, "")

    return cleaned_text