import re
import string
from collections import Counter
from difflib import SequenceMatcher

from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

def clean_text(text):
    """Lowercase a document, strip most punctuation, collapse
    whitespace, and keep only the first 18 words."""
    # Exclude "," and "." so numbers keep their separators
    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = "".join([c for c in text if c not in punctuations])
    # Collapse whitespace and newlines
    text = re.sub(r"\s+", " ", text)
    # Replace the pound sign with a placeholder (the result must be
    # reassigned; str.replace does not modify the string in place)
    text = text.replace("£", " * ")
    words = text.split()
    text = " ".join(words[:18])  # Join the first 18 words back into a string
    return text
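
# Illustrative doctest-style example (input is assumed, not from the
# original module): commas and periods survive cleaning so numeric
# values stay readable, and "£" becomes the "*" placeholder.
# >>> clean_text("The Price, rose to £5.10 today!")
# 'the price, rose to * 5.10 today'
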
def remove_punctuation(text):
    """Remove punctuation from a given text, keeping periods."""
    punctuation_without_dot = string.punctuation.replace(".", "")
    translator = str.maketrans("", "", punctuation_without_dot)
    return text.translate(translator)
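
# Illustrative example (input is assumed): every punctuation mark except
# the period is stripped.
# >>> remove_punctuation("Hello, world: the price is 5.10!")
# 'Hello world the price is 5.10'
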
def get_keywords(text, num_keywords=5):
    """Return the top `num_keywords` keywords from a doc using the
    TF-IDF method."""
    # Create a TF-IDF Vectorizer. Note: with a single document the IDF
    # term is constant, so the ranking reduces to term frequency.
    vectorizer = TfidfVectorizer(stop_words="english")
    # Fit and transform the text
    tfidf_matrix = vectorizer.fit_transform([text])
# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()
# Get TF-IDF scores
tfidf_scores = tfidf_matrix.toarray()[0]
# Sort words by TF-IDF score
word_scores = list(zip(feature_names, tfidf_scores))
word_scores.sort(key=lambda x: x[1], reverse=True)
# Return top keywords
return [word for word, score in word_scores[:num_keywords]]
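
# Illustrative example (input is assumed): English stop words such as
# "the" and "on" are discarded; the repeated token ranks first, and
# ties keep alphabetical order.
# >>> get_keywords("the cat sat on the mat with another cat", num_keywords=2)
# ['cat', 'mat']
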
def get_important_sentences(
paragraph: str,
keywords: list[str],
num_sentences: int = 3,
) -> list[str]:
"""
Selects important sentences based on a list of keywords.
Args:
paragraph (str): The input paragraph.
keywords (list[str]): List of important keywords.
num_sentences (int): Number of sentences to return (default is 3).
Returns:
list: A list of important sentences.
"""
# Clean and split the paragraph into sentences
sentences = [
s.strip() for s in re.split(r"(?<=[.!?])\s+", paragraph) if s.strip()
]
    # Score each sentence by how often the keywords occur in it.
    # Note: clean_text keeps only the first 18 words, so keyword hits
    # later in a very long sentence are not counted.
    sentence_scores = []
    for sentence in sentences:
        processed_sentence = clean_text(sentence)
        words = processed_sentence.split()  # clean_text already lowercases
        word_count = Counter(words)
        score = sum(word_count[keyword.lower()] for keyword in keywords)
        sentence_scores.append((sentence, score))
# Sort sentences by their scores in descending order
sentence_scores.sort(key=lambda x: x[1], reverse=True)
# Return the top N sentences
return [sentence for sentence, score in sentence_scores[:num_sentences]]
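
# Illustrative example (inputs are assumed): matching is on exact
# cleaned tokens, so "Cats" does not count toward the keyword "cat".
# >>> para = "Cats sleep a lot. Dogs bark loudly. My cat naps all day."
# >>> get_important_sentences(para, ["cat"], num_sentences=1)
# ['My cat naps all day.']
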
def extract_important_phrases(
    paragraph: str,
    keywords: list[str],
    phrase_length: int | None = None,
) -> list[str]:
    """
    Extracts important phrases based on a list of keywords.
    Phrase length is auto-determined when not given (5 to 7 words), and
    selected phrases overlap by at most 20%.
    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): List of important keywords.
        phrase_length (int | None): Length of phrases to extract;
            auto-determined when None.
    Returns:
        list: A list of important phrases.
    """
    # Tokenize the paragraph into words
    words = word_tokenize(paragraph.lower())
    # Auto-determine phrase length (clamped between 5 and 7 words)
    if phrase_length is None:
        phrase_length = min(max(len(words) // 10, 5), 7)
# Generate n-grams (phrases) from the paragraph
phrases = list(ngrams(words, phrase_length))
important_phrases = []
used_indices = set()
for i, phrase in enumerate(phrases):
# Check if the phrase contains any keyword
if any(keyword.lower() in phrase for keyword in keywords):
# Check overlap with previously selected phrases
if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
important_phrases.append(clean_text(" ".join(phrase)))
used_indices.add(i)
return important_phrases
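
# Illustrative example (input is assumed; requires NLTK's "punkt"
# tokenizer data): with ~15 tokens the auto length is 5, and phrases
# overlapping the first match are suppressed.
# >>> extract_important_phrases(
# ...     "The quick brown fox jumps over the lazy dog near the river "
# ...     "bank every morning",
# ...     ["fox"],
# ... )
# ['the quick brown fox jumps']
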
def extract_equal_text(text1, text2):
    """Return word-index ranges of spans that match in both texts,
    ignoring case and punctuation."""

    def cleanup(text):
        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        return text

    split_text1 = cleanup(text1).split()
    split_text2 = cleanup(text2).split()
    s = SequenceMatcher(None, split_text1, split_text2)
    equal_idx_1 = []
    equal_idx_2 = []
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == "equal":
            equal_idx_1.append({"start": i1, "end": i2})
            equal_idx_2.append({"start": j1, "end": j2})
    return equal_idx_1, equal_idx_2
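
# Illustrative example (inputs are assumed): ranges are half-open word
# indexes, in the style of difflib opcodes.
# >>> extract_equal_text("The cat sat down.", "A cat sat here.")
# ([{'start': 1, 'end': 3}], [{'start': 1, 'end': 3}])
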
def connect_consecutive_indexes(nums):
    """
    Connects consecutive integers in a sorted list.
    Args:
        nums: A list of integers, assumed sorted in ascending order.
    Returns:
        A list of lists, where each inner list represents a consecutive range.
    """
    if not nums:  # Handle empty input
        return []
    result = []
    start = nums[0]
    end = nums[0]
for i in range(1, len(nums)):
if nums[i] == end + 1:
end = nums[i]
else:
result.append([start, end])
start = nums[i]
end = nums[i]
result.append([start, end]) # Add the last range
return result
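
# Illustrative example (input is assumed): runs of consecutive integers
# collapse into [start, end] pairs; singletons repeat the same value.
# >>> connect_consecutive_indexes([1, 2, 3, 7, 8, 10])
# [[1, 3], [7, 8], [10, 10]]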