import re
import string
from collections import Counter
from difflib import SequenceMatcher

from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer


def clean_text(text):
    """Normalize a document: lowercase, strip punctuation, collapse whitespace,
    and keep only the first 18 words."""
    # Exclude ',' and '.' from removal because they can appear inside numbers
    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""

    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = "".join([c for c in text if c not in punctuations])

    # Collapse whitespace and newlines
    text = re.sub(r"\s+", " ", text)

    # str.replace returns a new string, so the result must be reassigned
    text = text.replace("£", " * ")

    words = text.split()
    text = " ".join(words[:18])  # Keep only the first 18 words

    return text
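
# Illustrative usage (a sketch of the expected cleaning behavior, not an official
# example from the original module):
#   clean_text("Hello, World!  This is a TEST sentence.")
#   -> "hello, world this is a test sentence."
#   ('!' is removed, ',' and '.' are kept, text is lowercased, whitespace is
#    collapsed, and only the first 18 words are retained)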


def remove_punctuation(text):
    """Remove punctuation from a given text."""
    punctuation_without_dot = string.punctuation.replace(".", "")
    translator = str.maketrans("", "", punctuation_without_dot)
    return text.translate(translator)
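
# Illustrative usage (a sketch; '.' is the only punctuation character kept):
#   remove_punctuation("Pi is ~3.14, roughly!")
#   -> "Pi is 3.14 roughly"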


def get_keywords(text, num_keywords=5):
    """Return top k keywords from a doc using TF-IDF method"""

    # Create a TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words="english")

    # Fit and transform the text
    tfidf_matrix = vectorizer.fit_transform([text])

    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Get TF-IDF scores
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Sort words by TF-IDF score
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)

    # Return top keywords
    return [word for word, score in word_scores[:num_keywords]]
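
# Illustrative usage (scores come from a single-document TF-IDF fit, so the ranking
# mostly reflects term frequency; the exact tie order is not guaranteed):
#   get_keywords("The cat sat on the mat. The cat chased the mouse.", num_keywords=3)
#   ranks "cat" first (it appears twice); the remaining single-occurrence words tie.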


def get_important_sentences(
    paragraph: str,
    keywords: list[str],
    num_sentences: int = 3,
) -> list[str]:
    """
    Selects important sentences based on a list of keywords.

    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): List of important keywords.
        num_sentences (int): Number of sentences to return (default is 3).

    Returns:
        list: A list of important sentences.
    """
    # Clean and split the paragraph into sentences
    sentences = [
        s.strip() for s in re.split(r"(?<=[.!?])\s+", paragraph) if s.strip()
    ]

    # Calculate the importance score for each sentence
    sentence_scores = []
    for sentence in sentences:
        processed_sentence = clean_text(sentence)
        score = 0
        words = processed_sentence.lower().split()
        word_count = Counter(words)

        for keyword in keywords:
            if keyword.lower() in word_count:
                score += word_count[keyword.lower()]

        sentence_scores.append((sentence, score))

    # Sort sentences by their scores in descending order
    sentence_scores.sort(key=lambda x: x[1], reverse=True)

    # Return the top N sentences
    return [sentence for sentence, score in sentence_scores[:num_sentences]]
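
# Illustrative usage (a sketch; sentences tied on score keep their original order
# because Python's sort is stable):
#   get_important_sentences(
#       "Cats sleep a lot. Dogs bark. Cats and dogs can be friends.",
#       ["cats"],
#       num_sentences=1,
#   )
#   -> ['Cats sleep a lot.']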


def extract_important_phrases(
    paragraph: str,
    keywords: list[str],
    phrase_length: int = 5,
) -> list[str]:
    """
    Extracts important phrases based on a list of keywords.
    Phrase length is determined automatically, and selected phrases overlap by less than 20%.

    Args:
        paragraph (str): The input paragraph.
        keywords (list[str]): List of important keywords.
        phrase_length (int): Initial phrase length in words (default: 5); it is
            recomputed from the paragraph length below.

    Returns:
        list: A list of important phrases.
    """
    # Tokenize the paragraph into words
    words = word_tokenize(paragraph.lower())

    # Determine phrase length (between 5 and 7 words, based on paragraph length)
    phrase_length = min(max(len(words) // 10, 5), 7)

    # Generate n-grams (phrases) from the paragraph
    phrases = list(ngrams(words, phrase_length))

    important_phrases = []
    used_indices = set()

    for i, phrase in enumerate(phrases):
        # Check if the phrase contains any keyword
        if any(keyword.lower() in phrase for keyword in keywords):
            # Check overlap with previously selected phrases
            if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
                important_phrases.append(clean_text(" ".join(phrase)))
                used_indices.add(i)

    return important_phrases
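
# Illustrative usage (requires NLTK's 'punkt' tokenizer data; 'paragraph' below is a
# placeholder for your own text, and the exact phrases depend on the auto-selected
# n-gram length, so this is only a sketch):
#   extract_important_phrases(paragraph, ["climate"])
#   might return 5-7 word phrases containing "climate", e.g.
#   "climate change affects coastal regions most"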


def extract_equal_text(text1, text2):
    """
    Find word-level spans that are identical in both texts, ignoring case and
    punctuation, using difflib.SequenceMatcher.

    Returns two lists of {"start", "end"} word-index ranges, one per input text.
    """

    def cleanup(text):
        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        return text

    split_text1 = cleanup(text1).split()
    split_text2 = cleanup(text2).split()

    s = SequenceMatcher(None, split_text1, split_text2)

    equal_idx_1 = []
    equal_idx_2 = []
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == "equal":
            # Record the matching word ranges in each text
            equal_idx_1.append({"start": i1, "end": i2})
            equal_idx_2.append({"start": j1, "end": j2})
    return equal_idx_1, equal_idx_2
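
# Illustrative usage (indexes are word positions after lowercasing and punctuation
# removal; the exact spans come from difflib.SequenceMatcher, so treat this as a
# sketch):
#   extract_equal_text("The cat sat on the mat", "A cat sat on a mat")
#   should report the shared runs "cat sat on" and "mat", roughly
#   ([{'start': 1, 'end': 4}, {'start': 5, 'end': 6}],
#    [{'start': 1, 'end': 4}, {'start': 5, 'end': 6}])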


def connect_consecutive_indexes(nums):
    """
    Connects consecutive integers in a list.

    Args:
        nums: A list of integers.

    Returns:
        A list of lists, where each inner list represents a consecutive range.
    """

    if not nums:  # Handle empty input
        return []

    result = []
    start = nums[0]
    end = nums[0]

    for i in range(1, len(nums)):
        if nums[i] == end + 1:
            end = nums[i]
        else:
            result.append([start, end])
            start = nums[i]
            end = nums[i]

    result.append([start, end])  # Add the last range
    return result
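

if __name__ == "__main__":
    # Minimal smoke-test sketch (the example input is assumed, not part of the
    # original module): group consecutive integers into [start, end] ranges.
    print(connect_consecutive_indexes([1, 2, 3, 7, 8, 10]))
    # Expected output: [[1, 3], [7, 8], [10, 10]]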