import re
import string
from collections import Counter
from difflib import SequenceMatcher

from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer


def clean_text(text):
    """Normalize a document: lowercase, strip punctuation, collapse whitespace,
    and keep only the first 18 words."""
    # Exclude ',' and '.' from removal because they can appear inside numbers
    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""

    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = "".join([c for c in text if c not in punctuations])

    # Collapse whitespace and newlines
    text = re.sub(r"\s+", " ", text)

    # str.replace returns a new string, so the result must be reassigned
    text = text.replace("£", " * ")

    words = text.split()
    text = " ".join(words[:18])  # Keep only the first 18 words

    return text
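
# Illustrative usage (a sketch of the expected cleaning behavior, not an official
# example from the original module):
#   clean_text("Hello, World!  This is a TEST sentence.")
#   -> "hello, world this is a test sentence."
#   ('!' is removed, ',' and '.' are kept, text is lowercased, whitespace is
#    collapsed, and only the first 18 words are retained)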


def remove_punctuation(text):
    """Remove punctuation from a given text."""
    punctuation_without_dot = string.punctuation.replace(".", "")
    translator = str.maketrans("", "", punctuation_without_dot)
    return text.translate(translator)
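
# Illustrative usage (a sketch; '.' is the only punctuation character kept):
#   remove_punctuation("Pi is ~3.14, roughly!")
#   -> "Pi is 3.14 roughly"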


def get_keywords(text, num_keywords=5):
    """Return top k keywords from a doc using TF-IDF method"""

    # Create a TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words="english")

    # Fit and transform the text
    tfidf_matrix = vectorizer.fit_transform([text])

    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Get TF-IDF scores
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Sort words by TF-IDF score
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)

    # Return top keywords
    return [word for word, score in word_scores[:num_keywords]]
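
# Illustrative usage (scores come from a single-document TF-IDF fit, so the ranking
# mostly reflects term frequency; the exact tie order is not guaranteed):
#   get_keywords("The cat sat on the mat. The cat chased the mouse.", num_keywords=3)
#   ranks "cat" first (it appears twice); the remaining single-occurrence words tie.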


def get_important_sentences(
    paragraph: str,
    keywords: list[str],
    num_sentences: int = 3,
) -> list[str]:
    """
    Selects important sentences based on a list of keywords.

    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): List of important keywords.
        num_sentences (int): Number of sentences to return (default is 3).

    Returns:
        list: A list of important sentences.
    """
    # Clean and split the paragraph into sentences
    sentences = [
        s.strip() for s in re.split(r"(?<=[.!?])\s+", paragraph) if s.strip()
    ]

    # Calculate the importance score for each sentence
    sentence_scores = []
    for sentence in sentences:
        processed_sentence = clean_text(sentence)
        score = 0
        words = processed_sentence.lower().split()
        word_count = Counter(words)

        for keyword in keywords:
            if keyword.lower() in word_count:
                score += word_count[keyword.lower()]

        sentence_scores.append((sentence, score))

    # Sort sentences by their scores in descending order
    sentence_scores.sort(key=lambda x: x[1], reverse=True)

    # Return the top N sentences
    return [sentence for sentence, score in sentence_scores[:num_sentences]]
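
# Illustrative usage (a sketch; sentences tied on score keep their original order
# because Python's sort is stable):
#   get_important_sentences(
#       "Cats sleep a lot. Dogs bark. Cats and dogs can be friends.",
#       ["cats"],
#       num_sentences=1,
#   )
#   -> ['Cats sleep a lot.']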


def extract_important_phrases(
    paragraph: str,
    keywords: list[str],
    phrase_length: int = 5,
) -> list[str]:
    """
    Extracts important phrases based on a list of keywords.
    Phrase length is determined automatically, and selected phrases overlap by less than 20%.

    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): List of important keywords.
        phrase_length (int): Initial phrase length in words (default: 5); it is
            recomputed from the paragraph length below.

    Returns:
        list: A list of important phrases.
    """
    # Tokenize the paragraph into words
    words = word_tokenize(paragraph.lower())

    # Determine phrase length (between 5 and 7 words, based on paragraph length)
    phrase_length = min(max(len(words) // 10, 5), 7)

    # Generate n-grams (phrases) from the paragraph
    phrases = list(ngrams(words, phrase_length))

    important_phrases = []
    used_indices = set()

    for i, phrase in enumerate(phrases):
        # Check if the phrase contains any keyword
        if any(keyword.lower() in phrase for keyword in keywords):
            # Check overlap with previously selected phrases
            if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
                important_phrases.append(clean_text(" ".join(phrase)))
                used_indices.add(i)

    return important_phrases
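
# Illustrative usage (requires NLTK's 'punkt' tokenizer data; 'paragraph' below is a
# placeholder for your own text, and the exact phrases depend on the auto-selected
# n-gram length, so this is only a sketch):
#   extract_important_phrases(paragraph, ["climate"])
#   might return 5-7 word phrases containing "climate", e.g.
#   "climate change affects coastal regions most"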


def extract_equal_text(text1, text2):
    """
    Find word-level spans that are identical in both texts, ignoring case and
    punctuation, using difflib.SequenceMatcher.

    Returns two lists of {"start", "end"} word-index ranges, one per input text.
    """

    def cleanup(text):
        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        return text

    split_text1 = cleanup(text1).split()
    split_text2 = cleanup(text2).split()

    s = SequenceMatcher(None, split_text1, split_text2)

    equal_idx_1 = []
    equal_idx_2 = []
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == "equal":
            # Record the matching word ranges in each text
            equal_idx_1.append({"start": i1, "end": i2})
            equal_idx_2.append({"start": j1, "end": j2})
    return equal_idx_1, equal_idx_2
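
# Illustrative usage (indexes are word positions after lowercasing and punctuation
# removal; the exact spans come from difflib.SequenceMatcher, so treat this as a
# sketch):
#   extract_equal_text("The cat sat on the mat", "A cat sat on a mat")
#   should report the shared runs "cat sat on" and "mat", roughly
#   ([{'start': 1, 'end': 4}, {'start': 5, 'end': 6}],
#    [{'start': 1, 'end': 4}, {'start': 5, 'end': 6}])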


def connect_consecutive_indexes(nums):
    """
    Connects consecutive integers in a list.

    Args:
        nums: A list of integers.

    Returns:
        A list of lists, where each inner list represents a consecutive range.
    """

    if not nums:  # Handle empty input
        return []

    result = []
    start = nums[0]
    end = nums[0]

    for i in range(1, len(nums)):
        if nums[i] == end + 1:
            end = nums[i]
        else:
            result.append([start, end])
            start = nums[i]
            end = nums[i]

    result.append([start, end])  # Add the last range
    return result
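

if __name__ == "__main__":
    # Minimal smoke-test sketch (the example input is assumed, not part of the
    # original module): group consecutive integers into [start, end] ranges.
    print(connect_consecutive_indexes([1, 2, 3, 7, 8, 10]))
    # Expected output: [[1, 3], [7, 8], [10, 10]]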