import re
import string
from collections import Counter
from difflib import SequenceMatcher
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer


def clean_text(text):
    """Clean a document: lowercase, strip most punctuation, truncate to 18 words."""
    # Exclude "," and "." so numbers such as 3.14 keep their formatting
    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = "".join([c for c in text if c not in punctuations])
    # Collapse whitespace and newlines
    text = re.sub(r"\s+", " ", text)
    # str.replace returns a new string, so the result must be reassigned
    text = text.replace("£", " * ")
    words = text.split()
    text = " ".join(words[:18])  # Join the first 18 words back into a string
    return text
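
# A minimal usage sketch (hypothetical input; output traced by hand). Note
# that "," and "." survive cleaning while "!" does not:
#   clean_text("Hello,   World! It costs £5.")
#   -> "hello, world it costs * 5."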


def remove_punctuation(text):
    """Remove punctuation from a given text."""
    punctuation_without_dot = string.punctuation.replace(".", "")
    translator = str.maketrans("", "", punctuation_without_dot)
    return text.translate(translator)
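
# A minimal usage sketch: every punctuation mark except "." is stripped, so
# decimal numbers stay intact:
#   remove_punctuation("Hello, world: pi is 3.14!")
#   -> "Hello world pi is 3.14"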


def get_keywords(text, num_keywords=5):
    """Return the top k keywords from a document using TF-IDF scores."""
    # Create a TF-IDF vectorizer that drops English stop words
    vectorizer = TfidfVectorizer(stop_words="english")
    # Fit and transform the text
    tfidf_matrix = vectorizer.fit_transform([text])
    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()
    # Get TF-IDF scores
    tfidf_scores = tfidf_matrix.toarray()[0]
    # Sort words by TF-IDF score in descending order
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)
    # Return the top keywords
    return [word for word, score in word_scores[:num_keywords]]
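
# A minimal usage sketch (hypothetical input). With a single document the IDF
# term is constant, so the ranking reduces to term frequency; ties keep
# scikit-learn's alphabetical feature order because the sort is stable:
#   get_keywords("The cat sat. The cat ran. Dogs bark.", num_keywords=3)
#   -> ['cat', 'bark', 'dogs']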


def get_important_sentences(
    paragraph: str,
    keywords: list[str],
    num_sentences: int = 3,
) -> list[str]:
    """
    Selects important sentences based on a list of keywords.

    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): List of important keywords.
        num_sentences (int): Number of sentences to return (default is 3).

    Returns:
        list: A list of important sentences.
    """
    # Clean and split the paragraph into sentences
    sentences = [
        s.strip() for s in re.split(r"(?<=[.!?])\s+", paragraph) if s.strip()
    ]

    # Calculate the importance score for each sentence
    sentence_scores = []
    for sentence in sentences:
        processed_sentence = clean_text(sentence)
        score = 0
        words = processed_sentence.lower().split()
        word_count = Counter(words)
        for keyword in keywords:
            if keyword.lower() in word_count:
                score += word_count[keyword.lower()]
        sentence_scores.append((sentence, score))

    # Sort sentences by their scores in descending order
    sentence_scores.sort(key=lambda x: x[1], reverse=True)

    # Return the top N sentences
    return [sentence for sentence, score in sentence_scores[:num_sentences]]
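
# A minimal usage sketch (hypothetical input): each sentence is scored by how
# many times the keywords occur in it, and the highest-scoring ones win:
#   get_important_sentences(
#       "Cats purr. Dogs bark. Cats chase cats daily.",
#       keywords=["cats"],
#       num_sentences=1,
#   )
#   -> ['Cats chase cats daily.']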


def extract_important_phrases(
    paragraph: str,
    keywords: list[str],
    phrase_length: int = 5,
) -> list[str]:
    """
    Extracts important phrases based on a list of keywords.

    The phrase length is auto-determined from the paragraph length (the
    phrase_length argument is currently overridden), and selected phrases
    overlap by less than 20%.

    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): List of important keywords.
        phrase_length (int): Length of phrases to extract (default: 5 words).

    Returns:
        list: A list of important phrases.
    """
    # Tokenize the paragraph into words
    words = word_tokenize(paragraph.lower())

    # Determine phrase length (between 5 and 7 words)
    phrase_length = min(max(len(words) // 10, 5), 7)

    # Generate n-grams (phrases) from the paragraph
    phrases = list(ngrams(words, phrase_length))

    important_phrases = []
    used_indices = set()
    for i, phrase in enumerate(phrases):
        # Check if the phrase contains any keyword
        if any(keyword.lower() in phrase for keyword in keywords):
            # Skip phrases that overlap a previous pick by 20% or more
            if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
                important_phrases.append(clean_text(" ".join(phrase)))
                used_indices.add(i)

    return important_phrases
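
# A minimal usage sketch (hypothetical input). word_tokenize needs the NLTK
# "punkt" data (nltk.download("punkt")). For this 10-word paragraph the
# phrase length bottoms out at 5, and the keyword-bearing 5-grams that
# overlap the first pick are rejected:
#   extract_important_phrases(
#       "the solar probe measured plasma waves near the corona yesterday",
#       keywords=["probe"],
#   )
#   -> ['the solar probe measured plasma']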


def extract_equal_text(text1, text2):
    """Find word ranges where two texts match, ignoring case and punctuation."""

    def cleanup(text):
        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        return text

    split_text1 = cleanup(text1).split()
    split_text2 = cleanup(text2).split()

    s = SequenceMatcher(None, split_text1, split_text2)

    equal_idx_1 = []
    equal_idx_2 = []
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == "equal":
            equal_idx_1.append({"start": i1, "end": i2})
            equal_idx_2.append({"start": j1, "end": j2})
            # Debugging snippet:
            # subtext_1 = " ".join(text1.split()[i1:i2])
            # subtext_2 = " ".join(text2.split()[j1:j2])
            # print(f"{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] "
            #       f"{subtext_1!r:>55} --> {subtext_2!r}")

    return equal_idx_1, equal_idx_2
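
# A minimal usage sketch: matching runs on lowercased, punctuation-free word
# lists, and each "equal" opcode becomes a start/end word-index range into the
# respective text:
#   extract_equal_text("The quick brown fox.", "A quick brown cat")
#   -> ([{'start': 1, 'end': 3}], [{'start': 1, 'end': 3}])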


def connect_consecutive_indexes(nums):
    """
    Connects consecutive integers in a list.

    Args:
        nums: A list of integers.

    Returns:
        A list of lists, where each inner list represents a consecutive range.
    """
    if not nums:  # Handle empty input
        return []

    result = []
    start = nums[0]
    end = nums[0]

    for i in range(1, len(nums)):
        if nums[i] == end + 1:
            end = nums[i]
        else:
            result.append([start, end])
            start = nums[i]
            end = nums[i]

    result.append([start, end])  # Add the last range
    return result
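
# A minimal usage sketch: runs of consecutive integers collapse into
# [start, end] pairs, and isolated values become single-element ranges:
#   connect_consecutive_indexes([1, 2, 3, 7, 8, 10])
#   -> [[1, 3], [7, 8], [10, 10]]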