import re
import string
from collections import Counter
from difflib import SequenceMatcher

from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

def clean_text(text):
    """Normalize a document: lowercase, strip punctuation, and truncate."""
    # Exclude "," and "." so numbers like 1,000.5 survive
    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = "".join([c for c in text if c not in punctuations])
    # Collapse whitespace and newlines
    text = re.sub(r"\s+", " ", text)
    # str.replace returns a new string, so the result must be reassigned
    text = text.replace("£", " * ")
    words = text.split()
    text = " ".join(words[:18])  # Join the first 18 words back into a string
    return text
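
# Illustrative usage of clean_text (the sample sentence is made up):
#     clean_text("Hello, World!\nIt costs £5.")
#     -> "hello, world it costs * 5."
# Commas and periods survive, "£" becomes " * ", and output is capped at 18 words.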

def remove_punctuation(text):
    """Remove punctuation from a given text, keeping periods."""
    punctuation_without_dot = string.punctuation.replace(".", "")
    translator = str.maketrans("", "", punctuation_without_dot)
    return text.translate(translator)
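
# Illustrative usage of remove_punctuation (hypothetical input). Periods are
# kept, so sentence boundaries and decimals survive:
#     remove_punctuation("U.S. GDP grew 2.3%, analysts say!")
#     -> "U.S. GDP grew 2.3 analysts say"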

def get_keywords(text, num_keywords=5):
    """Return the top `num_keywords` keywords from a doc using TF-IDF."""
    # Create a TF-IDF vectorizer (with a single document, the IDF term is
    # constant, so scores reduce to normalized term frequencies)
    vectorizer = TfidfVectorizer(stop_words="english")
    # Fit and transform the text
    tfidf_matrix = vectorizer.fit_transform([text])
    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()
    # Get TF-IDF scores
    tfidf_scores = tfidf_matrix.toarray()[0]
    # Sort words by TF-IDF score, descending
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)
    # Return top keywords
    return [word for word, score in word_scores[:num_keywords]]
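
# Illustrative usage of get_keywords (sample text is hypothetical). Ranking
# follows term frequency here, and ties should fall back to alphabetical
# order because Python's sort is stable:
#     get_keywords("The cat sat on the mat. The cat purred.", num_keywords=2)
#     -> ["cat", "mat"]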

def get_important_sentences(
    paragraph: str,
    keywords: list[str],
    num_sentences: int = 3,
) -> list[str]:
    """
    Selects important sentences based on a list of keywords.

    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): List of important keywords.
        num_sentences (int): Number of sentences to return (default is 3).

    Returns:
        list: A list of important sentences.
    """
    # Split the paragraph into sentences on end-of-sentence punctuation
    sentences = [
        s.strip() for s in re.split(r"(?<=[.!?])\s+", paragraph) if s.strip()
    ]
    # Score each sentence by keyword occurrences (note that clean_text
    # truncates to 18 words, so only those words contribute to the score)
    sentence_scores = []
    for sentence in sentences:
        processed_sentence = clean_text(sentence)
        score = 0
        words = processed_sentence.lower().split()
        word_count = Counter(words)
        for keyword in keywords:
            if keyword.lower() in word_count:
                score += word_count[keyword.lower()]
        sentence_scores.append((sentence, score))
    # Sort sentences by their scores in descending order
    sentence_scores.sort(key=lambda x: x[1], reverse=True)
    # Return the top N sentences
    return [sentence for sentence, score in sentence_scores[:num_sentences]]
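
# Illustrative usage of get_important_sentences (sample paragraph is made up):
#     get_important_sentences(
#         "Solar power is clean. Coal is cheap. Solar panels last decades.",
#         keywords=["solar"],
#         num_sentences=2,
#     )
#     -> ["Solar power is clean.", "Solar panels last decades."]
# Both returned sentences mention "solar"; the stable sort preserves their
# original order when scores tie.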

def extract_important_phrases(
    paragraph: str,
    keywords: list[str],
    phrase_length: int = 5,
) -> list[str]:
    """
    Extracts important phrases based on a list of keywords.
    Phrase length is auto-determined, and overlapped parts are less than 20%.

    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): List of important keywords.
        phrase_length (int): Ignored; the length is recomputed below
            (kept for API compatibility, default: 5 words).

    Returns:
        list: A list of important phrases.
    """
    # Tokenize the paragraph into words
    words = word_tokenize(paragraph.lower())
    # Determine phrase length (clamped between 5 and 7 words)
    phrase_length = min(max(len(words) // 10, 5), 7)
    # Generate n-grams (phrases) from the paragraph
    phrases = list(ngrams(words, phrase_length))

    important_phrases = []
    used_indices = set()
    for i, phrase in enumerate(phrases):
        # Check if the phrase contains any keyword
        if any(keyword.lower() in phrase for keyword in keywords):
            # Skip phrases overlapping a selected phrase by more than 20%
            if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
                important_phrases.append(clean_text(" ".join(phrase)))
                used_indices.add(i)

    return important_phrases
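
# Illustrative usage of extract_important_phrases. This assumes the NLTK
# "punkt" tokenizer data is installed (nltk.download("punkt")); the variable
# some_paragraph below is hypothetical. The phrase_length argument is
# recomputed inside the function, so n-grams come out 5 to 7 words long:
#     extract_important_phrases(some_paragraph, keywords=["solar"])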

def extract_equal_text(text1, text2):
    def cleanup(text):
        # Lowercase and strip all punctuation before matching
        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        return text

    splited_text1 = cleanup(text1).split()
    splited_text2 = cleanup(text2).split()
    s = SequenceMatcher(None, splited_text1, splited_text2)

    equal_idx_1 = []
    equal_idx_2 = []
    text1 = text1.split()
    text2 = text2.split()
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == "equal":
            equal_idx_1.append({"start": i1, "end": i2})
            equal_idx_2.append({"start": j1, "end": j2})
            # Debugging snippet:
            # subtext_1 = " ".join(text1[i1:i2])
            # subtext_2 = " ".join(text2[j1:j2])
            # print(f"{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] "
            #       f"{subtext_1!r:>55} --> {subtext_2!r}")
    return equal_idx_1, equal_idx_2
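
# Illustrative usage of extract_equal_text: indexes refer to whitespace-split
# word positions, not characters, and "end" is exclusive. For example:
#     extract_equal_text("The quick brown fox", "A quick brown dog")
#     -> ([{"start": 1, "end": 3}], [{"start": 1, "end": 3}])
# because "quick brown" occupies word positions 1-2 in both strings.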

def connect_consecutive_indexes(nums):
    """
    Connects consecutive integers in a list.

    Args:
        nums: A list of integers.

    Returns:
        A list of lists, where each inner list represents a consecutive range.
    """
    if not nums:  # Handle empty input
        return []

    result = []
    start = nums[0]
    end = nums[0]
    for i in range(1, len(nums)):
        if nums[i] == end + 1:
            end = nums[i]
        else:
            result.append([start, end])
            start = nums[i]
            end = nums[i]
    result.append([start, end])  # Add the last range
    return result
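
# A minimal smoke test, runnable as a script; the sample strings are made up.
# connect_consecutive_indexes([1, 2, 3, 7, 8, 10]) should give
# [[1, 3], [7, 8], [10, 10]].
if __name__ == "__main__":
    print(clean_text("Hello, World!\nIt costs £5."))
    print(get_keywords("The cat sat on the mat. The cat purred.", 2))
    idx1, idx2 = extract_equal_text("The quick brown fox", "A quick brown dog")
    print(idx1, idx2)
    print(connect_consecutive_indexes([1, 2, 3, 7, 8, 10]))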