"""
Author: Khanh Phan
Date: 2024-12-04
"""
import re
import string
from collections import Counter
from difflib import SequenceMatcher
from nltk.tokenize import (
sent_tokenize,
word_tokenize,
)
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from src.application.config import PREFIX
def clean_text(text: str) -> str:
"""
Cleans and preprocesses a given text string.
Args:
text (str): The input text to be cleaned.
Returns:
str: The cleaned and preprocessed text, containing the first 18 words.
"""
# Define a set of punctuation characters to exclude,
# exclude comma and period due to numbers
punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
# Lowering text
text = text.lower()
# Removing punctuation
text = "".join([c for c in text if c not in punctuations])
# Removing whitespace and newlines
text = re.sub(r"\s+", " ", text)
# Replace £ with * because Google search doesn't recognize £
text.replace("£", " * ")
# Split the text into a list of words.
words = text.split()
# Join the first 18 words back into a string
text = " ".join(words[:18]) # TODO: consider another number
return text
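# Illustrative usage (documentation example, not in the original module;
# the output assumes the default 18-word truncation):
# >>> clean_text("The Quick, Brown Fox! Jumps over 12 lazy dogs.")
# 'the quick, brown fox jumps over 12 lazy dogs.'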
def remove_punctuation(text: str) -> str:
"""
Removes all punctuation characters from a string, except for periods (.).
Args:
text (str): The input string.
Returns:
str: The string with all punctuation characters removed,
except for periods.
"""
# Create a string containing all punctuation characters,
# except for periods.
punctuation_without_dot = string.punctuation.replace(".", "")
# Create a translation table to remove the specified punctuation chars.
translator = str.maketrans("", "", punctuation_without_dot)
# Apply the translation table to the input text and return the result.
return text.translate(translator)
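# Illustrative usage (documentation example, not in the original module):
# >>> remove_punctuation("Price: $5.99 (approx)!")
# 'Price 5.99 approx'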
def get_keywords(text: str, num_keywords: int = 5) -> list[str]:
"""
Extracts the top k keywords from a document using the TF-IDF method.
Args:
text (str): The input text from which to extract keywords.
num_keywords (int, optional): The number of top keywords to return.
Returns:
list: A list of the top keywords extracted from the text.
"""
# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words="english")
# Fit and transform the text
tfidf_matrix = vectorizer.fit_transform([text])
# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()
# Get TF-IDF scores
tfidf_scores = tfidf_matrix.toarray()[0]
# Sort words by TF-IDF score
word_scores = list(zip(feature_names, tfidf_scores))
word_scores.sort(key=lambda x: x[1], reverse=True)
# Return top keywords
return [word for word, score in word_scores[:num_keywords]]
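# Illustrative usage (documentation example, not in the original module).
# With a single document, TF-IDF scores reduce to term frequency, so
# repeated non-stopword terms rank first; tied terms keep alphabetical order:
# >>> get_keywords("The cat sat on the mat. The cat slept.", num_keywords=2)
# ['cat', 'mat']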
def get_important_sentences(
sentence: str,
keywords: list[str],
num_sentences: int = 3,
) -> list[str]:
"""
Selects important sentences based on a list of keywords.
Args:
        sentence (str): The input text to select sentences from.
keywords (list[str]): List of important keywords.
num_sentences (int): Number of sentences to return (default is 3).
Returns:
list: A list of important sentences.
"""
# Clean and split the sentence into sentences
sentences = [s for s in re.split(r"(?<=[.!?])\s+", sentence) if s]
# Calculate the importance score for each sentence
sentence_scores = []
for sentence in sentences:
processed_sentence = clean_text(sentence)
score = 0
words = processed_sentence.lower().split()
word_count = Counter(words)
for keyword in keywords:
if keyword.lower() in word_count:
score += word_count[keyword.lower()]
sentence_scores.append((sentence, score))
# Sort sentences by their scores in descending order
sentence_scores.sort(key=lambda x: x[1], reverse=True)
# Return the top N sentences
return [sentence for sentence, score in sentence_scores[:num_sentences]]
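# Illustrative usage (documentation example, not in the original module):
# >>> get_important_sentences(
# ...     "Solar power is clean. Coal is dirty. Solar panels are cheap.",
# ...     keywords=["solar"],
# ...     num_sentences=2,
# ... )
# ['Solar power is clean.', 'Solar panels are cheap.']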
def extract_important_phrases(
text: str,
keywords: list[str],
phrase_length: int = 5,
) -> list[str]:
"""
Extracts important phrases based on a list of keywords.
Phrase length is auto-determined, and overlapped parts are less than 20%.
Args:
text (str): The input text.
keywords (list[str]): List of important keywords.
        phrase_length (int): Requested phrase length (default: 5 words);
            currently recomputed from the text length (see below).
Returns:
list: A list of important phrases.
"""
# Tokenize the text into words
words = word_tokenize(text.lower())
    # Determine phrase length automatically (between 5 and 7 words),
    # overriding the phrase_length argument.
    phrase_length = min(max(len(words) // 10, 5), 7)
# Generate n-grams (phrases) from the text
phrases = list(ngrams(words, phrase_length))
important_phrases = []
used_indices = set()
for i, phrase in enumerate(phrases):
# Check if the phrase contains any keyword
if any(keyword.lower() in phrase for keyword in keywords):
# Check overlap with previously selected phrases
if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
important_phrases.append(clean_text(" ".join(phrase)))
used_indices.add(i)
return important_phrases
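# Illustrative usage (documentation sketch, not in the original module;
# `article_text` is a placeholder for a longer input). Returns cleaned
# 5-7 word phrases that contain a keyword, with limited overlap between
# selected phrases:
# >>> extract_important_phrases(article_text, keywords=["solar"])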
def extract_equal_text(
    text1: str,
    text2: str,
) -> tuple[list[dict], list[dict]]:
"""
Extracts the indices of equal text segments between two strings.
Args:
text1 (str): The first input string.
text2 (str): The second input string.
Returns:
tuple[
list[dict{"start": int, "end": int}],
list[dict{"start": int, "end": int}]
]
- list: the start and end indices of equal segments in text1.
- list: the start and end indices of equal segments in text2.
"""
def cleanup(text: str) -> str:
"""
Cleans up a text string by converting to lowercase
and removing punctuation.
Args:
text (str): The input text.
Returns:
str: The cleaned text.
"""
text = text.lower()
text = text.translate(str.maketrans("", "", string.punctuation))
return text
# Clean and split the input texts into lists of words.
splited_text1 = cleanup(text1).split()
splited_text2 = cleanup(text2).split()
# Create a SequenceMatcher object to compare the cleaned word lists.
s = SequenceMatcher(None, splited_text1, splited_text2)
equal_idx_1 = []
equal_idx_2 = []
# Split the original texts into lists of words (without cleaning).
text1 = text1.split()
text2 = text2.split()
for tag, i1, i2, j1, j2 in s.get_opcodes():
if tag == "equal":
# Append the start and end indices of the equal segment
# to the respective lists.
equal_idx_1.append({"start": i1, "end": i2})
equal_idx_2.append({"start": j1, "end": j2})
# subtext_1 = " ".join(text1[i1:i2])
# subtext_2 = " ".join(text2[j1:j2])
# print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] '
# f'{subtext_1!r:>55} --> {subtext_2!r}')
return equal_idx_1, equal_idx_2
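# Illustrative usage (documentation example, not in the original module).
# Indices are word positions; "end" is exclusive:
# >>> extract_equal_text("The cat sat on the mat", "A cat sat on a mat")
# ([{'start': 1, 'end': 4}, {'start': 5, 'end': 6}],
#  [{'start': 1, 'end': 4}, {'start': 5, 'end': 6}])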
def connect_consecutive_indexes(nums: list[int]) -> list[list[int]]:
"""
Connects consecutive integers in a list.
Args:
nums (list): A list of integers.
Returns:
list: A list of lists,
where each inner list represents a consecutive range.
For example: [1, 2, 3, 5, 6] becomes [[1, 3], [5, 6]].
"""
if not nums: # Handle empty input
return []
result = []
start = nums[0]
end = nums[0]
for i in range(1, len(nums)):
# Check if the current number is consecutive to the previous end.
if nums[i] == end + 1:
end = nums[i] # Extend the current range.
else:
# Add the current range to the result and start a new range.
result.append([start, end])
start = nums[i]
end = nums[i]
# Add the last range to the result.
result.append([start, end])
return result
def postprocess_label(labels: list[str]) -> str:
"""
    Creates a combined label string such as "[label1]",
    "[label1] and [label2]", or "[label1], [label2], and [label3]",
    for use in messages like "Partially generated by ...".
    Removes duplicate labels while preserving the original order.
Args:
labels: A list of strings representing labels.
Returns:
A string with the formatted label.
"""
for index, label in enumerate(labels):
# if label.startswith(PREFIX):
# labels[index] = label[len(PREFIX) :]
if PREFIX in label:
labels[index] = label.replace(PREFIX, "")
    # Deduplicate while preserving the original order
    # (set() would not guarantee order).
    labels = list(dict.fromkeys(labels))
label = ""
if len(labels) == 1:
label += labels[0]
elif len(labels) == 2:
label += f"{labels[0]} and {labels[1]}"
else:
combination = ", ".join(labels[0 : len(labels) - 1])
label += f"{combination}, and {labels[-1]}"
return label
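# Illustrative usage (documentation example, not in the original module;
# assumes PREFIX does not occur in the given labels):
# >>> postprocess_label(["gpt-4o", "gemini", "gpt-4o"])
# 'gpt-4o and gemini'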
def split_into_sentences(input_text: str) -> list[str]:
"""
Splits input text into sentences by newlines
and then tokenizes each paragraph into sentences.
Args:
input_text (str): The input text as a string.
Returns:
list: A list of sentences.
Returns an empty list if input is not a string.
"""
if not isinstance(input_text, str):
return []
# Split the input text into paragraphs based on newline characters,
# keeping the newline characters.
paragraphs = input_text.splitlines(keepends=True)
sentences = []
for paragraph in paragraphs:
# Remove leading/trailing whitespace
paragraph = paragraph.strip()
if paragraph and paragraph != "\n":
# Tokenize the paragraph into sentences
sentences.extend(sent_tokenize(paragraph))
return sentences
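# Illustrative usage (documentation example, not in the original module;
# requires the NLTK "punkt" tokenizer data to be available):
# >>> split_into_sentences("First line. Second sentence.\nNew paragraph.")
# ['First line.', 'Second sentence.', 'New paragraph.']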
def split_into_paragraphs(input_text: str) -> list[str]:
"""
Splits input text into paragraphs based on newline characters.
Args:
input_text (str): The input text as a string.
Returns:
list: A list of paragraphs.
Returns an empty list if input is not a string.
"""
if not isinstance(input_text, str):
return []
# Split the input text into paragraphs based on newline characters,
# keeping the newline characters.
paragraphs = input_text.splitlines(keepends=True)
out_paragraphs = []
for paragraph in paragraphs:
        # Keep leading/trailing whitespace (the paragraph is not stripped).
if paragraph and paragraph != "\n":
# Append the cleaned paragraph to the output list.
out_paragraphs.append(paragraph)
return out_paragraphs
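# Illustrative usage (documentation example, not in the original module;
# note that trailing newline characters are preserved):
# >>> split_into_paragraphs("Para one.\nPara two.\n")
# ['Para one.\n', 'Para two.\n']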
def extract_starts_ends(
colored_idx: list[dict],
) -> tuple[list[int], list[int]]:
"""
Extracts start and end indices from a list of dictionaries.
Args:
colored_idx (list[dict]): A list of dictionaries,
where each dictionary has 'start' and 'end' keys.
Returns:
tuple: A tuple containing two lists:
- starts (list[int]): A list of start indices.
- ends (list[int]): A list of end indices.
"""
starts = []
ends = []
for index in colored_idx:
starts.append(index["start"])
ends.append(index["end"])
return starts, ends
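# Illustrative usage (documentation example, not in the original module):
# >>> extract_starts_ends([{"start": 0, "end": 3}, {"start": 5, "end": 7}])
# ([0, 5], [3, 7])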
def filter_indices(
starts: list[int],
ends: list[int],
ignore_indices: list[int],
):
"""
Filters start and end indices to exclude any indices present in the
ignore_indices list.
Args:
starts (list[int]): A list of starting indices.
ends (list[int]): A list of ending indices.
Must be the same length as starts.
ignore_indices (list[int]): A list of indices to exclude.
Returns:
A tuple of two lists of integers:
- filtered_starts
- filtered_ends
Returns empty lists if the input is invalid
or if all ranges are filtered out.
Examples:
starts = [0, 5, 10]
ends = [3, 7, 12] # words at the end will not be colored.
ignore_indices = [1, 2, 12, 17]
# Output:
starts = [0, 3, 5, 10]
ends = [1, 4, 7, 12]
"""
if len(starts) != len(ends):
print(
"Error: The 'starts' & 'ends' lists must have the same length.",
)
return [], []
filtered_starts = []
filtered_ends = []
for i in range(len(starts)):
start = starts[i]
end = ends[i]
if end < start:
print(
f"Error: End index {end} < start index {start} at position {i}.", # noqa: E501
)
return [], []
new_start, new_end = extract_new_startend(
start,
end,
ignore_indices,
)
filtered_starts.extend(new_start)
filtered_ends.extend(new_end)
return filtered_starts, filtered_ends
def replace_leading_spaces(text: str) -> str:
"""
Replaces leading spaces in a string with '&nbsp;'.
Args:
text: The input string.
Returns:
The string with leading spaces replaced by '&nbsp;'.
"""
if text is None:
return None
leading_spaces = 0
for char in text:
if char == " ":
leading_spaces += 1
else:
break
if leading_spaces > 0:
return "&nbsp;" * leading_spaces + text[leading_spaces:]
else:
return text
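# Illustrative usage (documentation example, not in the original module):
# >>> replace_leading_spaces("  indented text")
# '&nbsp;&nbsp;indented text'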
def extract_new_startend(
start: int,
end: int,
ignore_indices: list[int],
) -> tuple[list[int], list[int]]:
"""
Extracts new start and end indices by splitting a range based on
ignored indices.
Args:
start (int): The starting index of the range.
end (int): The ending index of the range (exclusive).
ignore_indices (list): indices to ignore within the range.
Returns:
tuple: A tuple containing two lists:
- new_starts (list): Starting indices for the sub-ranges.
- new_ends (list): Ending indices for the sub-ranges.
"""
# Sort the set of ignore_indices in ascending order.
indexes = list(set(ignore_indices))
indexes.sort()
new_starts = []
new_ends = []
new_start = start
    # If there are no indices to ignore, return the original range.
    if not indexes:
new_starts.append(start)
new_ends.append(end)
return new_starts, new_ends
for index in indexes:
# Skip indices that are outside the range [start, end).
if index < start:
continue
elif index >= end:
continue
new_starts.append(new_start)
new_ends.append(index)
new_start = index + 1
new_starts.append(new_start)
new_ends.append(end)
return new_starts, new_ends
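# Illustrative usage (documentation example, not in the original module).
# The range [0, 10) is split around the ignored indices 3 and 7:
# >>> extract_new_startend(0, 10, [3, 7])
# ([0, 4, 8], [3, 7, 10])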