"""
Author: Khanh Phan
Date: 2024-12-04
"""
import re
import string
from collections import Counter
from difflib import SequenceMatcher
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
def clean_text(text: str) -> str:
"""
Cleans and preprocesses a given text string.
Args:
text (str): The input text to be cleaned.
Returns:
str: The cleaned and preprocessed text, containing the first 18 words.
"""
    # Define a set of punctuation characters to exclude;
    # commas and periods are kept because they appear in numbers.
    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
    # Lowercase the text.
    text = text.lower()
    # Remove punctuation.
    text = "".join([c for c in text if c not in punctuations])
    # Collapse runs of whitespace and newlines into single spaces.
    text = re.sub(r"\s+", " ", text)
    # Replace £ with * because Google search doesn't recognize £.
    text = text.replace("£", " * ")
# Split the text into a list of words.
words = text.split()
# Join the first 18 words back into a string
text = " ".join(words[:18]) # TODO: consider another number
return text
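# Minimal usage sketch (the input string is illustrative, not from the
# original file); note the £ substitution and the comma/period retention:
# >>> clean_text("Hello,   World! The price is £5.99.")
# 'hello, world the price is * 5.99.'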
def remove_punctuation(text: str) -> str:
"""
Removes all punctuation characters from a string, except for periods (.).
Args:
text (str): The input string.
Returns:
str: The string with all punctuation characters removed,
except for periods.
"""
# Create a string containing all punctuation characters,
# except for periods.
punctuation_without_dot = string.punctuation.replace(".", "")
# Create a translation table to remove the specified punctuation chars.
translator = str.maketrans("", "", punctuation_without_dot)
# Apply the translation table to the input text and return the result.
return text.translate(translator)
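# Example (hypothetical input): commas and exclamation marks are stripped,
# but the decimal point in "2.0" survives:
# >>> remove_punctuation("Hello, world! Version 2.0 is here.")
# 'Hello world Version 2.0 is here.'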
def get_keywords(text: str, num_keywords: int = 5) -> list[str]:
    """
    Extracts the top `num_keywords` keywords from a document using TF-IDF.
Args:
text (str): The input text from which to extract keywords.
num_keywords (int, optional): The number of top keywords to return.
Returns:
list: A list of the top keywords extracted from the text.
"""
# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words="english")
# Fit and transform the text
tfidf_matrix = vectorizer.fit_transform([text])
# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()
# Get TF-IDF scores
tfidf_scores = tfidf_matrix.toarray()[0]
# Sort words by TF-IDF score
word_scores = list(zip(feature_names, tfidf_scores))
word_scores.sort(key=lambda x: x[1], reverse=True)
# Return top keywords
return [word for word, score in word_scores[:num_keywords]]
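# Usage sketch (hypothetical input). With a single document, the IDF factor
# is uniform, so the ranking reduces to stopword-filtered term frequency;
# ties fall back to the vectorizer's alphabetical feature order:
# >>> get_keywords("data science uses data models and data pipelines", 2)
# ['data', 'models']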
def get_important_sentences(
sentence: str,
keywords: list[str],
num_sentences: int = 3,
) -> list[str]:
"""
    Selects important sentences based on a list of keywords.
    Args:
        sentence (str): The input text containing one or more sentences.
keywords (list[str]): List of important keywords.
num_sentences (int): Number of sentences to return (default is 3).
Returns:
list: A list of important sentences.
"""
# Clean and split the sentence into sentences
sentences = [
s.strip() for s in re.split(r"(?<=[.!?])\s+", sentence) if s.strip()
]
# Calculate the importance score for each sentence
sentence_scores = []
    for sent in sentences:
        processed_sentence = clean_text(sent)
        score = 0
        words = processed_sentence.lower().split()
        word_count = Counter(words)
        for keyword in keywords:
            if keyword.lower() in word_count:
                score += word_count[keyword.lower()]
        sentence_scores.append((sent, score))
# Sort sentences by their scores in descending order
sentence_scores.sort(key=lambda x: x[1], reverse=True)
# Return the top N sentences
return [sentence for sentence, score in sentence_scores[:num_sentences]]
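# Usage sketch (hypothetical text): sentences are ranked by how often the
# keywords occur in them; ties keep the original sentence order.
# >>> text = "Cats sleep a lot. Dogs bark loudly. Cats and dogs coexist."
# >>> get_important_sentences(text, ["cats"], num_sentences=1)
# ['Cats sleep a lot.']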
def extract_important_phrases(
text: str,
keywords: list[str],
phrase_length: int = 5,
) -> list[str]:
"""
Extracts important phrases based on a list of keywords.
Phrase length is auto-determined, and overlapped parts are less than 20%.
Args:
text (str): The input text.
keywords (list[str]): List of important keywords.
phrase_length (int): Length of phrases to extract (default: 5 words).
Returns:
list: A list of important phrases.
"""
# Tokenize the text into words
words = word_tokenize(text.lower())
    # Determine phrase length (between 5 and 7 words)
phrase_length = min(max(len(words) // 10, 5), 7)
# Generate n-grams (phrases) from the text
phrases = list(ngrams(words, phrase_length))
important_phrases = []
used_indices = set()
for i, phrase in enumerate(phrases):
# Check if the phrase contains any keyword
if any(keyword.lower() in phrase for keyword in keywords):
# Check overlap with previously selected phrases
if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
important_phrases.append(clean_text(" ".join(phrase)))
used_indices.add(i)
return important_phrases
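# Usage sketch (hypothetical text): for this 7-word input the auto-computed
# window is min(max(7 // 10, 5), 7) = 5 words, so the first 5-gram that
# contains the keyword is returned:
# >>> extract_important_phrases(
# ...     "climate change affects global weather patterns severely",
# ...     ["climate"])
# ['climate change affects global weather']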
def extract_equal_text(
    text1: str, text2: str
) -> tuple[list[dict], list[dict]]:
"""
Extracts the indices of equal text segments between two strings.
Args:
text1 (str): The first input string.
text2 (str): The second input string.
    Returns:
        tuple[list[dict], list[dict]]:
            - list: dicts of {"start": int, "end": int} marking equal
              word segments in text1.
            - list: dicts of {"start": int, "end": int} marking equal
              word segments in text2.
"""
def cleanup(text: str) -> str:
"""
Cleans up a text string by converting to lowercase
and removing punctuation.
Args:
text (str): The input text.
Returns:
str: The cleaned text.
"""
text = text.lower()
text = text.translate(str.maketrans("", "", string.punctuation))
return text
    # Clean and split the input texts into lists of words.
    split_text1 = cleanup(text1).split()
    split_text2 = cleanup(text2).split()
    # Create a SequenceMatcher object to compare the cleaned word lists.
    s = SequenceMatcher(None, split_text1, split_text2)
equal_idx_1 = []
equal_idx_2 = []
# Split the original texts into lists of words (without cleaning).
text1 = text1.split()
text2 = text2.split()
for tag, i1, i2, j1, j2 in s.get_opcodes():
if tag == "equal":
# Append the start and end indices of the equal segment
# to the respective lists.
equal_idx_1.append({"start": i1, "end": i2})
equal_idx_2.append({"start": j1, "end": j2})
# subtext_1 = " ".join(text1[i1:i2])
# subtext_2 = " ".join(text2[j1:j2])
# print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] '
# f'{subtext_1!r:>55} --> {subtext_2!r}')
return equal_idx_1, equal_idx_2
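# Usage sketch (hypothetical strings): only "cat sat" matches after
# cleanup, so both index lists point at words 1..3 (end-exclusive):
# >>> extract_equal_text("The cat sat here.", "A cat sat there.")
# ([{'start': 1, 'end': 3}], [{'start': 1, 'end': 3}])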
def connect_consecutive_indexes(nums: list[int]) -> list[list[int]]:
"""
Connects consecutive integers in a list.
Args:
nums (list): A list of integers.
Returns:
list: A list of lists,
where each inner list represents a consecutive range.
For example: [1, 2, 3, 5, 6] becomes [[1, 3], [5, 6]].
"""
if not nums: # Handle empty input
return []
result = []
start = nums[0]
end = nums[0]
for i in range(1, len(nums)):
# Check if the current number is consecutive to the previous end.
if nums[i] == end + 1:
end = nums[i] # Extend the current range.
else:
# Add the current range to the result and start a new range.
result.append([start, end])
start = nums[i]
end = nums[i]
# Add the last range to the result.
result.append([start, end])
return result
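# Example (from the docstring above):
# >>> connect_consecutive_indexes([1, 2, 3, 5, 6])
# [[1, 3], [5, 6]]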