Spaces:

pmkhanh7890
/

news_verification

Sleeping

File size: 7,168 Bytes

from collections import Counter
from difflib import SequenceMatcher
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams


def clean_text(text, max_words=18):
    """Normalize ``text`` and keep only its leading words.

    The text is lower-cased, stripped of most ASCII punctuation, has every
    pound sign replaced by a stand-alone ``*`` token, and is finally
    truncated to its first ``max_words`` whitespace-separated words.

    Args:
        text (str): The raw text to clean.
        max_words (int): Maximum number of words kept in the result
            (default is 18).

    Returns:
        str: The cleaned words joined by single spaces.
    """
    # "," and "." are deliberately absent so that numbers such as "1,000"
    # or "3.5" survive the cleaning step.
    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""

    # Lowering text
    text = text.lower()

    # Removing punctuation
    text = "".join(c for c in text if c not in punctuations)

    # Removing whitespace and newlines
    text = re.sub(r'\s+', ' ', text)

    # str.replace returns a new string, so the result has to be re-bound.
    # This runs after punctuation removal because "*" is itself in the
    # removal set and would otherwise be stripped again.
    text = text.replace("£", " * ")

    # split() also absorbs the extra spaces introduced by the replacement
    words = text.split()
    return ' '.join(words[:max_words])

def remove_punctuation(text):
    """Strip every ASCII punctuation character except the full stop."""
    # Map each unwanted code point to None so that translate() deletes it;
    # "." is skipped and therefore left in place.
    deletions = {ord(mark): None for mark in string.punctuation if mark != "."}
    return text.translate(deletions)

def get_keywords(text, num_keywords=5):
    """Pick the highest-scoring terms of a document by TF-IDF weight.

    A ``TfidfVectorizer`` with English stop words is fitted on ``text`` as
    a one-document corpus, and the terms are ranked by their resulting
    score.

    Args:
        text (str): The document to analyse.
        num_keywords (int): How many terms to return (default is 5).

    Returns:
        list: Up to ``num_keywords`` terms, best score first. Terms with
        equal scores keep the order in which the vectorizer lists them.
    """
    tfidf = TfidfVectorizer(stop_words='english')

    # The corpus has a single row, so row 0 holds the scores for ``text``
    scores = tfidf.fit_transform([text]).toarray()[0]
    vocabulary = tfidf.get_feature_names_out()

    # sorted() is stable, so ties stay in vocabulary order
    ranked = sorted(zip(vocabulary, scores), key=lambda pair: pair[1], reverse=True)

    return [term for term, _ in ranked[:num_keywords]]

"""
# Example usage
text = "Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals. Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however this definition is rejected by major AI researchers."
print(f"\n# Input text:\n'{text}'")
print("\n----------------------\n") 

keywords = get_keywords(text)
print("# Top keywords:", keywords)
print("\n----------------------\n")
"""

def get_important_sentences(paragraph: str, keywords: list[str], num_sentences: int = 3) -> list[str]:
    """
    Rank the sentences of a paragraph by how often they mention the keywords.

    The paragraph is split after ".", "!" or "?" followed by whitespace.
    Each sentence is passed through ``clean_text`` before counting, so only
    the words that survive that cleaning contribute to the score.

    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): Keywords to look for (matched case-insensitively
            against whole words).
        num_sentences (int): Number of sentences to return (default is 3).

    Returns:
        list: The original (uncleaned) sentences with the highest keyword
        counts, best first; ties keep their order of appearance.
    """
    scored = []
    for chunk in re.split(r'(?<=[.!?])\s+', paragraph):
        sentence = chunk.strip()
        if not sentence:
            continue

        # Word frequencies of the cleaned sentence; a Counter yields 0 for
        # words that are absent, so no membership test is needed.
        frequencies = Counter(clean_text(sentence).lower().split())
        hits = sum(frequencies[keyword.lower()] for keyword in keywords)
        scored.append((sentence, hits))

    # Stable sort: equally scored sentences remain in document order
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    return [sentence for sentence, _ in ranked[:num_sentences]]

"""# Example usage
keywords = get_keywords(paragraph)
important_sentences = get_important_sentences(paragraph, keywords)

print("# Important sentences:")
for i, sentence in enumerate(important_sentences, 1):
    print(f"{i}. {sentence}")
print("\n----------------------\n") 
"""

def extract_important_phrases(paragraph: str, keywords: list[str], phrase_length: int = 5) -> list[str]:
    """
    Collect keyword-bearing word windows (n-grams) from a paragraph.

    The window size is derived from the paragraph itself: one tenth of the
    token count, clamped to the range 5..7. A window is kept when it
    contains at least one keyword as a whole token and starts at least
    80% of a window length after the previously kept window.

    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): List of important keywords.
        phrase_length (int): NOTE(review): the value passed in is not used;
            it is overwritten by the auto-determined window size below.

    Returns:
        list: The kept windows, each joined into a string and passed
        through ``clean_text``, in order of appearance.
    """
    tokens = word_tokenize(paragraph.lower())

    # Auto-determined window size (between 5 and 7 tokens)
    phrase_length = min(max(len(tokens) // 10, 5), 7)
    min_gap = phrase_length * 0.8

    selected = []
    # Start indices only grow, so the most recently kept window is always
    # the closest one; checking it alone is enough to detect overlap.
    last_start = None

    for start, window in enumerate(ngrams(tokens, phrase_length)):
        if not any(keyword.lower() in window for keyword in keywords):
            continue
        if last_start is not None and start - last_start < min_gap:
            continue
        selected.append(clean_text(" ".join(window)))
        last_start = start

    return selected

def extract_equal_text(text1, text2):
    """
    Locate the word runs that two texts have in common.

    Both texts are lower-cased, stripped of ASCII punctuation and split on
    whitespace; the resulting word lists are aligned with
    ``SequenceMatcher``.

    Args:
        text1 (str): The first text.
        text2 (str): The second text.

    Returns:
        tuple: Two lists of ``{"start": ..., "end": ...}`` dicts, one per
        text. Entry ``k`` of each list describes the same matching run as a
        half-open word-index range into that text's normalized word list.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    words_1 = text1.lower().translate(drop_punctuation).split()
    words_2 = text2.lower().translate(drop_punctuation).split()

    matcher = SequenceMatcher(None, words_1, words_2)

    # Keep only the opcodes that describe identical runs
    matches = [op for op in matcher.get_opcodes() if op[0] == 'equal']

    spans_1 = [{"start": i1, "end": i2} for _, i1, i2, _, _ in matches]
    spans_2 = [{"start": j1, "end": j2} for _, _, _, j1, j2 in matches]
    return spans_1, spans_2

def connect_consecutive_indexes(nums):
    """
    Collapse runs of consecutive integers into [first, last] pairs.

    A run continues only while each value is exactly one greater than the
    value before it; anything else (gaps, repeats, decreases) starts a new
    run.

    Args:
        nums: A list of integers.

    Returns:
        A list of two-element lists ``[start, end]`` (both inclusive), in
        the order the runs occur; an empty list for empty input.
    """
    if not nums:
        return []

    # Seed with a one-element run and grow the last run in place
    ranges = [[nums[0], nums[0]]]
    for value in nums[1:]:
        current = ranges[-1]
        if value == current[1] + 1:
            current[1] = value
        else:
            ranges.append([value, value])

    return ranges

"""# Example usage
keywords = get_keywords(paragraph)
important_phrases = extract_important_phrases(paragraph, keywords)

print("# Important phrases:")
for i, phrase in enumerate(important_phrases[:5], 1):  # Print top 5 phrases
    print(f"{i}. {phrase}")"""