""" Author: Khanh Phan Date: 2024-12-04 """ import re import string from collections import Counter from difflib import SequenceMatcher from nltk.tokenize import word_tokenize from nltk.util import ngrams from sklearn.feature_extraction.text import TfidfVectorizer def clean_text(text: str) -> str: """ Cleans and preprocesses a given text string. Args: text (str): The input text to be cleaned. Returns: str: The cleaned and preprocessed text, containing the first 18 words. """ # Define a set of punctuation characters to exclude, # exclude comma and period due to numbers punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~""" # Lowering text text = text.lower() # Removing punctuation text = "".join([c for c in text if c not in punctuations]) # Removing whitespace and newlines text = re.sub(r"\s+", " ", text) # Replace £ with * because Google search doesn't recognize £ text.replace("£", " * ") # Split the text into a list of words. words = text.split() # Join the first 18 words back into a string text = " ".join(words[:18]) # TODO: consider another number return text def remove_punctuation(text: str) -> str: """ Removes all punctuation characters from a string, except for periods (.). Args: text (str): The input string. Returns: str: The string with all punctuation characters removed, except for periods. """ # Create a string containing all punctuation characters, # except for periods. punctuation_without_dot = string.punctuation.replace(".", "") # Create a translation table to remove the specified punctuation chars. translator = str.maketrans("", "", punctuation_without_dot) # Apply the translation table to the input text and return the result. return text.translate(translator) def get_keywords(text, num_keywords=5): """ Extracts the top k keywords from a document using the TF-IDF method. Args: text (str): The input text from which to extract keywords. num_keywords (int, optional): The number of top keywords to return. Returns: list: A list of the top keywords extracted from the text. """ # Create a TF-IDF Vectorizer vectorizer = TfidfVectorizer(stop_words="english") # Fit and transform the text tfidf_matrix = vectorizer.fit_transform([text]) # Get feature names (words) feature_names = vectorizer.get_feature_names_out() # Get TF-IDF scores tfidf_scores = tfidf_matrix.toarray()[0] # Sort words by TF-IDF score word_scores = list(zip(feature_names, tfidf_scores)) word_scores.sort(key=lambda x: x[1], reverse=True) # Return top keywords return [word for word, score in word_scores[:num_keywords]] def get_important_sentences( sentence: str, keywords: list[str], num_sentences: int = 3, ) -> list[str]: """ Selects important sentences based on a list of keywords. Args: sentence (str): The input sentence. keywords (list[str]): List of important keywords. num_sentences (int): Number of sentences to return (default is 3). Returns: list: A list of important sentences. 
""" # Clean and split the sentence into sentences sentences = [ s.strip() for s in re.split(r"(?<=[.!?])\s+", sentence) if s.strip() ] # Calculate the importance score for each sentence sentence_scores = [] for sentence in sentences: processed_sentence = clean_text(sentence) score = 0 words = processed_sentence.lower().split() word_count = Counter(words) for keyword in keywords: if keyword.lower() in word_count: score += word_count[keyword.lower()] sentence_scores.append((sentence, score)) # Sort sentences by their scores in descending order sentence_scores.sort(key=lambda x: x[1], reverse=True) # Return the top N sentences return [sentence for sentence, score in sentence_scores[:num_sentences]] def extract_important_phrases( text: str, keywords: list[str], phrase_length: int = 5, ) -> list[str]: """ Extracts important phrases based on a list of keywords. Phrase length is auto-determined, and overlapped parts are less than 20%. Args: text (str): The input text. keywords (list[str]): List of important keywords. phrase_length (int): Length of phrases to extract (default: 5 words). Returns: list: A list of important phrases. """ # Tokenize the text into words words = word_tokenize(text.lower()) # Determine phrase length (between 3 and 7 words) phrase_length = min(max(len(words) // 10, 5), 7) # Generate n-grams (phrases) from the text phrases = list(ngrams(words, phrase_length)) important_phrases = [] used_indices = set() for i, phrase in enumerate(phrases): # Check if the phrase contains any keyword if any(keyword.lower() in phrase for keyword in keywords): # Check overlap with previously selected phrases if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices): important_phrases.append(clean_text(" ".join(phrase))) used_indices.add(i) return important_phrases def extract_equal_text(text1: str, text2: str) -> tuple[list[int], list[int]]: """ Extracts the indices of equal text segments between two strings. Args: text1 (str): The first input string. text2 (str): The second input string. Returns: tuple[ list[dict{"start": int, "end": int}], list[dict{"start": int, "end": int}] ] - list: the start and end indices of equal segments in text1. - list: the start and end indices of equal segments in text2. """ def cleanup(text: str) -> str: """ Cleans up a text string by converting to lowercase and removing punctuation. Args: text (str): The input text. Returns: str: The cleaned text. """ text = text.lower() text = text.translate(str.maketrans("", "", string.punctuation)) return text # Clean and split the input texts into lists of words. splited_text1 = cleanup(text1).split() splited_text2 = cleanup(text2).split() # Create a SequenceMatcher object to compare the cleaned word lists. s = SequenceMatcher(None, splited_text1, splited_text2) equal_idx_1 = [] equal_idx_2 = [] # Split the original texts into lists of words (without cleaning). text1 = text1.split() text2 = text2.split() for tag, i1, i2, j1, j2 in s.get_opcodes(): if tag == "equal": # Append the start and end indices of the equal segment # to the respective lists. equal_idx_1.append({"start": i1, "end": i2}) equal_idx_2.append({"start": j1, "end": j2}) # subtext_1 = " ".join(text1[i1:i2]) # subtext_2 = " ".join(text2[j1:j2]) # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] ' # f'{subtext_1!r:>55} --> {subtext_2!r}') return equal_idx_1, equal_idx_2 def connect_consecutive_indexes(nums: list[int]) -> list[list[int, int]]: """ Connects consecutive integers in a list. Args: nums (list): A list of integers. 
def connect_consecutive_indexes(nums: list[int]) -> list[list[int]]:
    """
    Connects consecutive integers in a list into ranges.

    Args:
        nums (list): A list of integers.

    Returns:
        list: A list of [start, end] pairs, where each pair represents a
            consecutive range. For example: [1, 2, 3, 5, 6] becomes
            [[1, 3], [5, 6]].
    """
    if not nums:  # Handle empty input.
        return []

    result = []
    start = nums[0]
    end = nums[0]

    for i in range(1, len(nums)):
        # Check if the current number is consecutive to the previous end.
        if nums[i] == end + 1:
            end = nums[i]  # Extend the current range.
        else:
            # Add the current range to the result and start a new range.
            result.append([start, end])
            start = nums[i]
            end = nums[i]

    # Add the last range to the result.
    result.append([start, end])
    return result
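# Minimal usage sketch for connect_consecutive_indexes, plus an entry point
# that runs the demos above when the module is executed directly.
def _demo_connect_indexes() -> None:
    print(connect_consecutive_indexes([1, 2, 3, 5, 6]))  # [[1, 3], [5, 6]]


if __name__ == "__main__":
    _demo_keyword_helpers()
    _demo_sentence_helpers()
    _demo_connect_indexes()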