Commit 0827f9d
Parent(s): a5e8d12

Add comments to text module

Files changed:
- application.py +2 -2
- gpt_test.py +1 -1
- src/application/config.py +84 -0
- src/application/content_detection.py +12 -14
- src/application/text/entity.py +238 -99
- src/application/text/helper.py +96 -16
- src/application/text/highlight_text.py +0 -202
- src/application/text/model_detection.py +69 -54
- src/application/text/preprocessing.py +31 -10
- src/application/text/search.py +84 -42
- src/application/text/search_detection.py +85 -183
- test.py +3 -74
application.py CHANGED

@@ -125,7 +125,7 @@ FOR GOVERNOR<br>
 - Each highlighted pair (marked with a number) shows the key differences
 between the input text and the source.
 """
-    table =
+    table = """
 <h5>Comparison between input news and source news:</h5>
 <table border="1" style="width:100%; text-align:left;">
     <col style="width: 170px;">
@@ -144,7 +144,7 @@ between the input text and the source.
     <tr>
         <td style="border-bottom: 1px solid transparent";>TBD</td>
         <td style="border-bottom: 1px solid transparent";>TBD</td>
-        <td rowspan="2">
+        <td rowspan="2">TBD</td>
         <td rowspan="2">TBD</td>
     </tr>
     <tr>
gpt_test.py CHANGED

@@ -96,7 +96,7 @@ azure_client = AzureOpenAI(
     api_version="2024-05-01-preview",
 )

-deplopment_name = "gpt-4o"
+deplopment_name = "gpt-4o"  # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
 TEXT_PROMPT = """
 Paraphrase the following news, only output the paraphrased text:

src/application/config.py ADDED

@@ -0,0 +1,84 @@
+# Download necessary NLTK data files
+"""
+Author: Khanh Phan
+Date: 2024-12-04
+"""
+import os
+
+import nltk
+import openai
+import torch
+from dotenv import load_dotenv
+from sentence_transformers import SentenceTransformer
+
+# Load environment variables
+load_dotenv()
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
+AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
+AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
+
+# GPT Model
+GPT_ENTITY_MODEL = "o1-mini"  # "gpt-4o-mini" or "o1-mini"
+GPT_PARAPHRASE_MODELS = ["gpt-4o", "gpt-4o-mini"]
+AZUREOPENAI_CLIENT = openai.AzureOpenAI(
+    api_version=AZURE_OPENAI_API_VERSION,  # AZURE_OPENAI_API_VERSION,
+    api_key=AZURE_OPENAI_API_KEY,
+    azure_endpoint=AZURE_OPENAI_ENDPOINT,
+)
+
+# Download the resources
+nltk.download("punkt", quiet=True)  # Sentence tokenization
+nltk.download("punkt_tab", quiet=True)  # Tokenization with tab-separated data
+nltk.download("stopwords", quiet=True)  # A list of stop words
+STOPWORDS_LANG = "english"
+
+# Load PARAPHASE_MODEL
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+PARAPHRASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
+PARAPHRASE_MODEL.to(DEVICE)
+
+# Model to detect AI-generated text
+AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
+
+# Thresholds
+PARAPHRASE_THRESHOLD_HUMAN = 0.963
+PARAPHRASE_THRESHOLD_MACHINE = 0.8
+PARAPHRASE_THRESHOLD = 0.8
+
+MIN_SAME_SENTENCE_LEN = 6
+MIN_PHRASE_SENTENCE_LEN = 10
+MIN_RATIO_PARAPHRASE_NUM = 0.5
+MAX_CHAR_SIZE = 30000
+
+# Number of top URLs per search
+TOP_URLS_PER_SEARCH = 3
+
+# Search parameters
+GOOGLE_ENDPOINT_URL = "https://www.googleapis.com/customsearch/v1"
+TOP_SEARCH_RESUTLS = 10
+CHUNK_SIZE = 32  # words
+NUM_CHUNKS = 3  # number of chunks to search
+NUM_FREQUENT_WORDS = 32  # number of top words to return
+NUM_KEYWORDS = 5  # number of keywords to return
+
+# Labels
+MODEL_HUMAN_LABEL = {AI_TEXT_DECTECTION_MODEL: "Human"}
+HUMAN = "HUMAN"
+MACHINE = "MACHINE"
+UNKNOWN = "UNKNOWN"
+PARAPHRASE = "PARAPHRASE"
+NON_PARAPHRASE = "NON_PARAPHRASE"
+
+# Entity color
+"""
+factor > 1: Lightens the color.
+factor = 1: Leaves the color unchanged.
+factor < 1: Darkens the color.
+factor = 0: Black.
+"""
+ENTITY_LIGHTEN_COLOR = 2.2
+ENTITY_DARKEN_COLOR = 0.7
+ENTITY_SATURATION = 0.65  # Saturation: color's intensity (vividness).
+ENTITY_BRIGHTNESS = 0.75  # color's brightness.
src/application/content_detection.py CHANGED

@@ -16,7 +16,7 @@ from src.application.text.model_detection import (
     detect_text_by_ai_model,
     predict_generation_model,
 )
-from src.application.text.preprocessing import split_into_paragraphs
+from src.application.text.preprocessing import split_into_paragraphs
 from src.application.text.search_detection import (
     PARAPHRASE_THRESHOLD_MACHINE,
     find_sentence_source,
@@ -112,7 +112,7 @@ class NewsVerification:
                 na=False,
             )
         ]
-
+
         if len(machine_label) > 0:
             label = " ".join(machine_label["label"].tolist())
             self.text_prediction_label[0] = label
@@ -147,7 +147,7 @@ class NewsVerification:
         print("CHECK TEXT:")
         print("\tFrom search engine:")
         # Classify by search engine
-        #input_sentences = split_into_sentences(self.news_text)
+        # input_sentences = split_into_sentences(self.news_text)
         input_paragraphs = split_into_paragraphs(self.news_text)

         # Setup df for input_sentences
@@ -402,8 +402,6 @@ class NewsVerification:
             if span_row == 1:
                 last_url_row = True

-            # end_of_paragraph = is_newline_after_text(row[0]["input"], self.news_content)
-
             formatted_row = self.format_text_fact_checker_row(
                 row,
                 first_url_row,
@@ -873,11 +871,11 @@ class NewsVerification:

             start_end = list(range(start, end + 1, 1))
             start_end = list(set(start_end) - set(ignore_indices))
-            #new_start, new_end = self.extract_sequences(start_end)
+            # new_start, new_end = self.extract_sequences(start_end)
             new_start, new_end = self.extract_new_startend(
-                start,
-                end,
-                ignore_indices
+                start,
+                end,
+                ignore_indices,
             )
             filtered_starts.extend(new_start)
             filtered_ends.extend(new_end)
@@ -885,7 +883,7 @@ class NewsVerification:
         return filtered_starts, filtered_ends

     def extract_new_startend(self, start, end, ignore_indices):
-        # sort a set of ignore_indices
+        # sort a set of ignore_indices
         indexes = list(set(ignore_indices))
         indexes.sort()

@@ -896,22 +894,22 @@ class NewsVerification:
             new_starts.append(start)
             new_ends.append(end)
             return new_starts, new_ends
-
+
         for index in indexes:
             if index < start:
                 continue
             elif index >= end:
                 continue
-
+
             new_starts.append(new_start)
             new_ends.append(index)

             new_start = index + 1
-
+
             new_starts.append(new_start)
             new_ends.append(end)

-        return new_starts, new_ends
+        return new_starts, new_ends

     def extract_sequences(self, numbers):
         if len(numbers) == 1:
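For reference, the range splitting that extract_new_startend performs in the hunks above can be sketched as a standalone function (a hypothetical helper for illustration only, not part of this commit):

    # sketch: split [start, end] into sub-ranges that skip the ignored indices
    def split_range(start: int, end: int, ignore_indices: list[int]) -> tuple[list[int], list[int]]:
        new_starts, new_ends = [], []
        ignored = sorted(i for i in set(ignore_indices) if start <= i < end)
        if not ignored:
            return [start], [end]
        cursor = start
        for idx in ignored:
            new_starts.append(cursor)  # sub-range ends right at the ignored index
            new_ends.append(idx)
            cursor = idx + 1           # next sub-range starts after it
        new_starts.append(cursor)
        new_ends.append(end)
        return new_starts, new_ends

    print(split_range(0, 10, [3, 4, 7]))  # ([0, 4, 5, 8], [3, 4, 7, 10])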
src/application/text/entity.py CHANGED

@@ -1,34 +1,49 @@
+"""
+Author: Khanh Phan
+Date: 2024-12-04
+"""
+
 import colorsys
 import json
-import os
 import re

 import gradio as gr
 import openai
-from dotenv import load_dotenv
 from transformers import pipeline

-
-
-
-
-
-
-
-client = openai.AzureOpenAI(
-    api_version="2024-05-01-preview",  # AZURE_OPENAI_API_VERSION,
-    api_key=AZURE_OPENAI_API_KEY,
-    azure_endpoint=AZURE_OPENAI_ENDPOINT,
+from src.application.config import (
+    AZUREOPENAI_CLIENT,
+    ENTITY_BRIGHTNESS,
+    ENTITY_DARKEN_COLOR,
+    ENTITY_LIGHTEN_COLOR,
+    ENTITY_SATURATION,
+    GPT_ENTITY_MODEL,
 )

+ner_pipeline = pipeline("ner")
+

 def extract_entities_gpt(
     original_text,
     compared_text,
-    text_generation_model=
-):
-
-
+    text_generation_model=GPT_ENTITY_MODEL,
+) -> str:
+    """
+    Extracts entity pairs with significantly different meanings between
+    two texts using a GPT model.
+
+    Args:
+        original_text (str): The original text.
+        compared_text (str): The paraphrased or compared text.
+        text_generation_model (str, optional): The GPT model
+            to use for entity extraction.
+
+    Returns:
+        str: The JSON-like string containing the extracted entity pairs,
+            or an empty string if an error occurs.
+    """
+
+    # Construct the prompt for the GPT model.
     prompt = f"""
 Compare the ORIGINAL TEXT and the COMPARED TEXT.
 Find entity pairs with significantly different meanings after paraphrasing.
@@ -60,14 +75,15 @@ If there are no entities that satisfy above condition, output empty list "[]".
 {compared_text}
 """

-    # Generate text using the text generation model
     # Generate text using the selected model
     try:
-
+        # Send the prompt to the GPT model and get the response.
+        response = AZUREOPENAI_CLIENT.chat.completions.create(
             model=text_generation_model,
             messages=[{"role": "user", "content": prompt}],
         )

+        # Extract the generated content from the response.
         res = response.choices[0].message.content

     except openai.OpenAIError as e:
@@ -77,15 +93,27 @@ If there are no entities that satisfy above condition, output empty list "[]".
     return res


-def read_json(json_string) -> list[list[str]]:
+def read_json(json_string: str) -> list[list[str, str]]:
+    """
+    Parses a JSON string and returns a list of unique entity pairs.
+
+    Args:
+        json_string (str): The JSON string to parse.
+
+    Returns:
+        List[List[str, str]]: A list of unique entity pairs,
+            or an empty list if parsing fails.
+    """
     try:
+        # Attempt to parse the JSON string into a Python object
         entities = json.loads(json_string)
+
         # Remove duplicates pair of entities
         unique_entities = []
         for inner_list in entities:
+            # Check if the current entity pair is already existed.
             if inner_list not in unique_entities:
                 unique_entities.append(inner_list)
-
         return unique_entities

     except json.JSONDecodeError as e:
@@ -93,66 +121,94 @@ def read_json(json_string) -> list[list[str]]:
     return []


-def
-
-
-
-
-
-        int(hex_color[2:4], 16),
-        int(hex_color[4:6], 16),
-    )
-
-    # Convert to HSV
-    h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
-    v = min(1.0, v * factor)  # Increase brightness
-
-    # Convert back to HEX
-    r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
-    return f"#{r:02x}{g:02x}{b:02x}"
-
-
-
+def set_color_brightness(
+    hex_color: str,
+    brightness_factor: float = ENTITY_LIGHTEN_COLOR,
+) -> str:
+    """
+    Lightens a HEX color by increasing its brightness in HSV space.
+
+    Args:
+        hex_color (str): The HEX color code (e.g., "#RRGGBB").
+        factor (float, optional): The factor by which to increase brightness.
+
+    Returns:
+        str: The lightened HEX color code.
+    """
+    # Remove the '#' prefix if present.
     hex_color = hex_color.lstrip("#")
+
+    # Convert the HEX color to RGB (red, green, blue) integers.
     r, g, b = (
-        int(hex_color[0:2], 16),
-        int(hex_color[2:4], 16),
-        int(hex_color[4:6], 16),
+        int(hex_color[0:2], 16),  # Red component
+        int(hex_color[2:4], 16),  # Green component
+        int(hex_color[4:6], 16),  # Blue component
     )

-    # Convert to HSV
+    # Convert RGB to HSV (hue, saturation, value/brightness)
     h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
-    v = max(0, v * factor)  # Reduce brightness

-    #
+    # Increase the brightness by the specified factor, but cap it at 1.0.
+    v = min(1.0, v * brightness_factor)
+
+    # Convert the modified HSV back to RGB
     r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
+
+    # Convert the RGB values back to a HEX color code.
     return f"#{r:02x}{g:02x}{b:02x}"


-def
-    """
+def generate_colors(index: int, total_colors: int = 20) -> str:
+    """
+    Generates a unique, evenly spaced color for each index using HSL.
+
+    Args:
+        index (int): The index for which to generate a color.
+        total_colors (int, optional): The total number of colors to
+            distribute evenly. Defaults to 20.
+
+    Returns:
+        str: A HEX color code representing the generated color.
+    """
+    # Calculate the hue value based on the index and total number of colors.
+    # This ensures even distribution of hues across the color spectrum.
     hue = index / total_colors  # Spread hues in range [0,1]
-    saturation = 0.65  # Keep colors vivid
-    lightness = 0.75  # Balanced brightness

     # Convert HSL to RGB
-    r, g, b = colorsys.hls_to_rgb(hue,
+    r, g, b = colorsys.hls_to_rgb(hue, ENTITY_SATURATION, ENTITY_BRIGHTNESS)
+
+    # Scale the RGB values from [0, 1] to [0, 255]
     r, g, b = int(r * 255), int(g * 255), int(b * 255)

-
-
-
+    # Convert to hex
+    return f"#{r:02x}{g:02x}{b:02x}"
+
+
+def assign_colors_to_entities(entities: list) -> list[dict]:
+    """
+    Assigns unique colors to each entity pair in a list.
+
+    Args:
+        entities (list): A list of entity pairs,
+            where each pair is a list of two strings.
+            Example: [["entity1_original", "entity1_compared"]]
+
+    Returns:
+        list: A list of dictionaries,
+            where each dictionary contains
+            - "color": the color of entity pair.
+            - "input": the original entity string.
+            - "source": the compared entity string.
+    """
+    # Number of colors needed.
     total_colors = len(entities)
-
+
+    # Assign colors to entities using their index.
     entities_colors = []
     for index, entity in enumerate(entities):
-        color =
+        color = generate_colors(index, total_colors)

-        #
+        # Append color and index to entities_colors
         entities_colors.append(
             {"color": color, "input": entity[0], "source": entity[1]},
         )
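The color logic in the hunk above can be exercised on its own. A standalone sketch using only the standard library, with the same saturation/brightness values that config.py introduces:

    # sketch: evenly spaced pastel colors via HLS, as hex strings
    import colorsys

    def spaced_hex_color(index: int, total: int, saturation: float = 0.65, lightness: float = 0.75) -> str:
        # hue spread evenly over [0, 1), then HLS -> RGB -> "#rrggbb"
        r, g, b = colorsys.hls_to_rgb(index / total, lightness, saturation)
        return f"#{int(r * 255):02x}{int(g * 255):02x}{int(b * 255):02x}"

    palette = [spaced_hex_color(i, 5) for i in range(5)]
    print(palette)  # five visually distinct colors for five entity pairs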
@@ -160,43 +216,83 @@ def assign_colors_to_entities(entities):
     return entities_colors


-def highlight_entities(text1, text2):
+def highlight_entities(text1: str, text2: str) -> list[dict]:
+    """
+    Highlights entities with significant differences between
+    two texts by assigning them unique colors.
+
+    Args:
+        text1 (str): input text.
+        text2 (str): source text.
+
+    Returns:
+        list: A list of dictionaries, where each dictionary
+        contains the highlighted entity information (color, input, source)
+        or None if no significant entities are found or an error occurs.
+    """
     if text1 is None or text2 is None:
         return None

+    # Extract entities with significant differences using a GPT model.
     entities_text = extract_entities_gpt(text1, text2)

-    # Clean up entities
+    # Clean up the extracted entities string by removing wrapping characters.
     entities_text = entities_text.replace("```json", "").replace("```", "")

+    # Parse the cleaned entities string into a Python list of entity pairs.
     entities = read_json(entities_text)
+
+    # If no significant entities are found, return None.
     if len(entities) == 0:
         return None

-    # Assign colors to entities
+    # Assign unique colors to the extracted entities.
     entities_with_colors = assign_colors_to_entities(entities)

     return entities_with_colors


-def apply_highlight(
+def apply_highlight(
+    text: str,
+    entities_with_colors: list[dict],
+    key: str = "input",
+    count: int = 0,
+) -> tuple[str, list[int]]:
+    """
+    Applies highlighting to specified entities within a text,
+    assigning them unique colors and index labels.
+
+    Args:
+        text (str): The text to highlight.
+        entities_with_colors (list): A list of dictionaries,
+            where each dictionary represents an entity and its color.
+        key (str, optional): The key in the entity dictionary that
+            contains the entity text to highlight.
+        count (int, optional): An offset to add to the index labels.
+
+    Returns:
+        tuple:
+            - A tuple containing the highlighted text (str).
+            - A list of index positions (list).
+    """
     if entities_with_colors is None:
         return text, []

+    # Start & end indices of highlighted entities.
     all_starts = []
     all_ends = []
     highlighted_text = ""
     temp_text = text
+
+    # Apply highlighting to each entity.
     for index, entity in enumerate(entities_with_colors):
         highlighted_text = ""

-        # find a list of starts and ends of entity in text:
-        # starts = [m.start() for m in re.finditer(entity[key], temp_text)]
-        # ends = [m.end() for m in re.finditer(entity[key], temp_text)]
         starts = []
         ends = []
-
+
         for m in re.finditer(
+            # Word boundaries (\b) and escape special characters
             r"\b" + re.escape(entity[key]) + r"\b",
             temp_text,
         ):
@@ -206,78 +302,116 @@ def apply_highlight(text, entities_with_colors, key="input", count=0):
         all_starts.extend(starts)
         all_ends.extend(ends)

+        # Get the colors for each occurrence of the entity.
         color = entities_with_colors[index]["color"]
-
+
+        # Lightened color for background text
+        entity_color = set_color_brightness(
             color,
-
-        )
-
+            brightness_factor=ENTITY_LIGHTEN_COLOR,
+        )
+        # Darker color for background label (index)
+        label_color = set_color_brightness(
             entity_color,
-
-        )
+            brightness_factor=ENTITY_DARKEN_COLOR,
+        )

-        # Apply highlighting to each entity
+        # Apply highlighting to each occurrence of the entity.
         prev_end = 0
         for start, end in zip(starts, ends):
-            #
+            # Non-highlighted text before the entity.
             highlighted_text += temp_text[prev_end:start]

-            #
+            # Create the index label with the specified color and style.
             index_label = (
                 f'<span_style="background-color:{label_color};color:white;'
                 f"padding:1px_4px;border-radius:4px;font-size:12px;"
                 f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>'  # noqa: E501
             )

-            #
+            # Highlighted entity with the specified color and style.
             highlighted_text += (
                 f'<span_style="background-color:{entity_color};color:black;'
                 f'border-radius:3px;font-size:14px;display:inline-block;">'
                 f"{index_label}{temp_text[start:end]}</span>"
             )
             prev_end = end
+
+        # Append any remaining text after the last entity.
         highlighted_text += temp_text[prev_end:]
+
+        # Update the temporary text with the highlighted text.
         temp_text = highlighted_text

     if highlighted_text == "":
         return text, []
+
+    # Get the index list of the highlighted text.
     highlight_idx_list = get_index_list(highlighted_text)
     return highlighted_text, highlight_idx_list


-def get_index_list(highlighted_text):
+def get_index_list(highlighted_text: str) -> list[int]:
     """
-    Generates a list of indices
+    Generates a list of indices of highlighted words within a text.

     Args:
-
-
+        highlighted_text (str): The text containing highlighted words
+            wrapped in HTML-like span tags.

     Returns:
-        A list
-
-        end < start, etc.).
+        list: A list of indices corresponding to the highlighted words.
+            An empty list if no highlighted words are found.
     """
     highlighted_index = []
+    start_index = None
+    end_index = None
     words = highlighted_text.split()
     for index, word in enumerate(words):
+        # Check if the word starts with a highlighted word.
         if word.startswith("<span_style"):
             start_index = index
+
+        # Check if the word ends with a closing span tag
         if word.endswith("</span>"):
             end_index = index
-
-
+            if start_index is not None:
+                # Add the range of indices to the result list.
+                highlighted_index.extend(
+                    list(
+                        range(
+                            start_index,
+                            end_index + 1,
+                        ),
+                    ),
+                )
+
+                start_index = None
+                end_index = None

     return highlighted_index


-def extract_entities(text):
+def extract_entities(text: str):
+    """
+    Extracts named entities from the given text.
+
+    Args:
+        text (str): The input text to extract entities from.
+
+    Returns:
+        list: A list of unique extracted entities (string).
+    """
+    # Apply the Named Entity Recognition (NER) pipeline to the input text.
     output = ner_pipeline(text)
+
+    # Extract words from the NER pipeline output.
     words = extract_words(output)
+
+    # Combine subwords into complete words.
     words = combine_subwords(words)

-    #
-    # connect words if there is no space between them
+    # Append the entities if it's not a duplicate.
     entities = []
     for entity in words:
         if entity not in entities:
@@ -286,15 +420,17 @@ def extract_entities(text):
     return entities


-def extract_words(entities):
+def extract_words(entities: list[dict]) -> list[str]:
     """
     Extracts the words from a list of entities.

     Args:
-
+        entities (list): A list of entities,
+            where each entity is expected to be a dictionary
+            containing a "word" key.

     Returns:
-
+        list[str]: A list of words extracted from the entities.
     """
     words = []
     for entity in entities:
@@ -307,24 +443,26 @@ def combine_subwords(word_list):
     Combines subwords (indicated by "##") with the preceding word in a list.

     Args:
-
+        word_list (list): A list of words,
+            where subwords are prefixed with "##".

     Returns:
-
+        list: A new list with subwords combined with their preceding words
+            and hyphenated words combined.
     """
     result = []
     i = 0
     while i < len(word_list):
         if word_list[i].startswith("##"):
-
-
-
-
-            i < len(word_list) - 2 and word_list[i + 1] == "-"
-        ):  # Combine hyphenated words
+            # Remove "##" and append the remaining to the previous word
+            result[-1] += word_list[i][2:]
+        elif i < len(word_list) - 2 and word_list[i + 1] == "-":
+            # Combine the current word, the hyphen, and the next word.
             result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
-            i += 2  # Skip the next two words
+            i += 2  # Skip the next two words (hyphen and the following word)
         else:
+            # If neither a subword nor a hyphenated word,
+            # append the current word to the result list.
             result.append(word_list[i])
             i += 1
     return result
@@ -360,6 +498,7 @@ is losing territory in the east. Zelensky praised Japan's commitment
 on Thursday, amid wider concerns that the next US President, who is
 set to take office on Monday, could potentially reduce aid.
 """
+
 if __name__ == "__main__":
     with gr.Blocks() as demo:
         gr.Markdown("### Highlight Matching Parts Between Two Texts")
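The subword handling documented for combine_subwords above can be illustrated with a simplified standalone sketch (it only merges "##" word pieces and skips the hyphen case the real function also covers):

    # sketch: merge WordPiece-style "##" fragments into the preceding token
    def merge_subwords(tokens: list[str]) -> list[str]:
        merged: list[str] = []
        for token in tokens:
            if token.startswith("##") and merged:
                merged[-1] += token[2:]  # glue the fragment onto the previous word
            else:
                merged.append(token)
        return merged

    print(merge_subwords(["Zel", "##ensky", "visited", "Tokyo"]))
    # ['Zelensky', 'visited', 'Tokyo']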
src/application/text/helper.py CHANGED

@@ -1,3 +1,8 @@
+"""
+Author: Khanh Phan
+Date: 2024-12-04
+"""
+
 import re
 import string
 from collections import Counter
@@ -8,9 +13,18 @@ from nltk.util import ngrams
 from sklearn.feature_extraction.text import TfidfVectorizer


-def clean_text(text):
-    """
-
+def clean_text(text: str) -> str:
+    """
+    Cleans and preprocesses a given text string.
+
+    Args:
+        text (str): The input text to be cleaned.
+
+    Returns:
+        str: The cleaned and preprocessed text, containing the first 18 words.
+    """
+    # Define a set of punctuation characters to exclude,
+    # exclude comma and period due to numbers
     punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""

     # Lowering text
@@ -22,23 +36,51 @@ def clean_text(text):
     # Removing whitespace and newlines
     text = re.sub(r"\s+", " ", text)

+    # Replace £ with * because Google search doesn't recognize £
     text.replace("£", " * ")

+    # Split the text into a list of words.
     words = text.split()
-
+
+    # Join the first 18 words back into a string
+    text = " ".join(words[:18])  # TODO: consider another number

     return text


-def remove_punctuation(text):
-    """
+def remove_punctuation(text: str) -> str:
+    """
+    Removes all punctuation characters from a string, except for periods (.).
+
+    Args:
+        text (str): The input string.
+
+    Returns:
+        str: The string with all punctuation characters removed,
+            except for periods.
+    """
+    # Create a string containing all punctuation characters,
+    # except for periods.
     punctuation_without_dot = string.punctuation.replace(".", "")
+
+    # Create a translation table to remove the specified punctuation chars.
     translator = str.maketrans("", "", punctuation_without_dot)
+
+    # Apply the translation table to the input text and return the result.
     return text.translate(translator)


 def get_keywords(text, num_keywords=5):
-    """
+    """
+    Extracts the top k keywords from a document using the TF-IDF method.
+
+    Args:
+        text (str): The input text from which to extract keywords.
+        num_keywords (int, optional): The number of top keywords to return.
+
+    Returns:
+        list: A list of the top keywords extracted from the text.
+    """

     # Create a TF-IDF Vectorizer
     vectorizer = TfidfVectorizer(stop_words="english")
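The TF-IDF ranking that get_keywords documents above can be reproduced in a few lines; a standalone sketch only, the committed function keeps its own signature and constants:

    # sketch: top-k keywords of a single document via TF-IDF scores
    from sklearn.feature_extraction.text import TfidfVectorizer

    def top_keywords(text: str, k: int = 5) -> list[str]:
        vectorizer = TfidfVectorizer(stop_words="english")
        scores = vectorizer.fit_transform([text]).toarray()[0]
        terms = vectorizer.get_feature_names_out()
        ranked = sorted(zip(terms, scores), key=lambda pair: pair[1], reverse=True)
        return [term for term, _ in ranked[:k]]

    print(top_keywords("Deep learning models are improving AI research and AI products."))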
@@ -142,41 +184,76 @@ def extract_important_phrases(
     return important_phrases


-def extract_equal_text(text1, text2):
-
+def extract_equal_text(text1: str, text2: str) -> tuple[list[int], list[int]]:
+    """
+    Extracts the indices of equal text segments between two strings.
+
+    Args:
+        text1 (str): The first input string.
+        text2 (str): The second input string.
+
+    Returns:
+        tuple[
+            list[dict{"start": int, "end": int}],
+            list[dict{"start": int, "end": int}]
+        ]
+            - list: the start and end indices of equal segments in text1.
+            - list: the start and end indices of equal segments in text2.
+    """
+
+    def cleanup(text: str) -> str:
+        """
+        Cleans up a text string by converting to lowercase
+        and removing punctuation.
+
+        Args:
+            text (str): The input text.
+
+        Returns:
+            str: The cleaned text.
+        """
         text = text.lower()
         text = text.translate(str.maketrans("", "", string.punctuation))
         return text

+    # Clean and split the input texts into lists of words.
     splited_text1 = cleanup(text1).split()
     splited_text2 = cleanup(text2).split()

+    # Create a SequenceMatcher object to compare the cleaned word lists.
     s = SequenceMatcher(None, splited_text1, splited_text2)

     equal_idx_1 = []
     equal_idx_2 = []
+
+    # Split the original texts into lists of words (without cleaning).
     text1 = text1.split()
     text2 = text2.split()
     for tag, i1, i2, j1, j2 in s.get_opcodes():
         if tag == "equal":
+            # Append the start and end indices of the equal segment
+            # to the respective lists.
             equal_idx_1.append({"start": i1, "end": i2})
             equal_idx_2.append({"start": j1, "end": j2})
-
-
+
+        # subtext_1 = " ".join(text1[i1:i2])
+        # subtext_2 = " ".join(text2[j1:j2])
         # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] '
         #       f'{subtext_1!r:>55} --> {subtext_2!r}')
     return equal_idx_1, equal_idx_2


-def connect_consecutive_indexes(nums):
+def connect_consecutive_indexes(nums: list[int]) -> list[list[int, int]]:
     """
     Connects consecutive integers in a list.

     Args:
-        nums: A list of integers.
+        nums (list): A list of integers.

     Returns:
-        A list of lists,
+        list: A list of lists,
+            where each inner list represents a consecutive range.
+            For example: [1, 2, 3, 5, 6] becomes [[1, 3], [5, 6]].
     """

     if not nums:  # Handle empty input
@@ -187,12 +264,15 @@ def connect_consecutive_indexes(nums):
     end = nums[0]

     for i in range(1, len(nums)):
+        # Check if the current number is consecutive to the previous end.
         if nums[i] == end + 1:
-            end = nums[i]
+            end = nums[i]  # Extend the current range.
         else:
+            # Add the current range to the result and start a new range.
             result.append([start, end])
             start = nums[i]
             end = nums[i]

-
+    # Add the last range to the result.
+    result.append([start, end])
     return result
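A compact illustration of the SequenceMatcher pattern used by extract_equal_text above (standalone sketch; the dictionaries follow the same {"start", "end"} shape):

    # sketch: word-level "equal" spans between two strings
    from difflib import SequenceMatcher

    def equal_word_spans(a: str, b: str) -> tuple[list[dict], list[dict]]:
        words_a, words_b = a.lower().split(), b.lower().split()
        matcher = SequenceMatcher(None, words_a, words_b)
        spans_a, spans_b = [], []
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == "equal":
                spans_a.append({"start": i1, "end": i2})
                spans_b.append({"start": j1, "end": j2})
        return spans_a, spans_b

    print(equal_word_spans("the cat sat on the mat", "a cat sat on a mat"))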
src/application/text/highlight_text.py DELETED

@@ -1,202 +0,0 @@
-import colorsys
-
-import gradio as gr
-
-
-def lighten_color(hex_color, factor=1.8):
-    """Lightens a HEX color by increasing its brightness in HSV space."""
-
-    hex_color = hex_color.lstrip("#")
-    r, g, b = (
-        int(hex_color[0:2], 16),
-        int(hex_color[2:4], 16),
-        int(hex_color[4:6], 16),
-    )
-
-    # Convert to HSV
-    h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
-    v = min(1.0, v * factor)  # Increase brightness
-
-    # Convert back to HEX
-    r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
-    return f"#{r:02x}{g:02x}{b:02x}"
-
-
-def darken_color(hex_color, factor=0.7):
-    """Darkens a hex color by reducing its brightness in the HSV space."""
-
-    hex_color = hex_color.lstrip("#")
-    r, g, b = (
-        int(hex_color[0:2], 16),
-        int(hex_color[2:4], 16),
-        int(hex_color[4:6], 16),
-    )
-
-    # Convert to HSV to adjust brightness
-    h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
-    v = max(0, v * factor)  # Reduce brightness
-
-    # Convert back to HEX
-    r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
-    return f"#{r:02x}{g:02x}{b:02x}"
-
-
-# Generate unique colors for pairs
-def generate_color(index, total_colors=20):
-    """Generates a unique, evenly spaced color for each index using HSL."""
-
-    hue = index / total_colors  # Spread hues in range [0,1]
-    saturation = 0.65  # Keep colors vivid
-    lightness = 0.75  # Balanced brightness
-
-    # Convert HSL to RGB
-    r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
-    r, g, b = int(r * 255), int(g * 255), int(b * 255)
-
-    return f"#{r:02x}{g:02x}{b:02x}"  # Convert to hex
-
-
-def highlight_pairs(text1, text2):
-    """Highlight matching pairs between two texts"""
-    # Predefined matching pairs
-    match_pairs = [
-        {
-            "index": 1,
-            "text1": "deep learning",
-            "start1": 13,
-            "end1": 26,
-            "text2": "deep learning",
-            "start2": 12,
-            "end2": 25,
-        },
-        {
-            "index": 2,
-            "text1": "neural networks",
-            "start1": 56,
-            "end1": 71,
-            "text2": "neural networks",
-            "start2": 68,
-            "end2": 83,
-        },
-        {
-            "index": 3,
-            "text1": "AI research",
-            "start1": 86,
-            "end1": 97,
-            "text2": "AI research",
-            "start2": 55,
-            "end2": 66,
-        },
-    ]
-
-    # Assign unique colors to each index
-    pair_colors = {
-        pair["index"]: generate_color(
-            pair["index"],
-            total_colors=len(match_pairs),
-        )
-        for pair in match_pairs
-    }
-
-    def apply_highlight(
-        text,
-        pairs,
-        key_start,
-        key_end,
-        key_index,
-        pair_colors,
-    ):
-        highlighted_text = ""
-        prev_end = 0
-
-        for pair in sorted(pairs, key=lambda x: x[key_start]):
-            start, end, index = pair[key_start], pair[key_end], pair[key_index]
-            color = pair_colors.get(
-                index,
-                "#ddd",
-            )  # Default color if not found
-            color = lighten_color(
-                color,
-                factor=2.2,
-            )  # Lightened color for background text
-            label_color = darken_color(
-                color,
-                factor=0.7,
-            )  # Make label color darker
-
-            # Style the index as a label
-            index_label = (
-                f'<span style="background-color:{label_color}; color:white; '
-                f"padding:1px 4px; border-radius:4px; font-size:12px; "
-                f'font-weight:bold; display:inline-block; margin-right:4px;">{index}</span>'  # noqa: E501
-            )
-
-            # Append non-highlighted text
-            highlighted_text += text[prev_end:start]
-            # Append highlighted text with index label
-            highlighted_text += (
-                f'<span style="background-color:{color}; '
-                f'border-radius:3px; font-size:14px; display:inline-block;">'
-                f"{index_label} {text[start:end]}</span>"
-            )
-            prev_end = end
-
-        # Append remaining text
-        highlighted_text += text[prev_end:]
-        return highlighted_text
-
-    # Apply highlighting to both texts using the global MATCH_PAIRS
-    highlighted_text1 = apply_highlight(
-        text1,
-        match_pairs,
-        "start1",
-        "end1",
-        "index",
-        pair_colors,
-    )
-    highlighted_text2 = apply_highlight(
-        text2,
-        match_pairs,
-        "start2",
-        "end2",
-        "index",
-        pair_colors,
-    )
-
-    return highlighted_text1, highlighted_text2
-
-
-if __name__ == "__main__":
-    # Create Gradio Interface
-    text1 = ""
-
-    with gr.Blocks() as demo:
-        gr.Markdown("### Highlight Matching Parts Between Two texts")
-        text1_input = gr.Textbox(
-            label="Text 1",
-            lines=5,
-            value="""
-The field of deep learning is advancing rapidly.
-Modern neural networks are improving AI research significantly.
-""",
-        )
-        text2_input = gr.Textbox(
-            label="Text 2",
-            lines=5,
-            value="""
-Advances in deep learning have led to breakthroughs in AI research.
-Neural networks are at the core of these innovations",
-""",
-        )
-        output1 = gr.HTML()
-        output2 = gr.HTML()
-        submit_button = gr.Button("Highlight Matches")
-
-        submit_button.click(
-            fn=highlight_pairs,
-            inputs=[text1_input, text2_input],
-            outputs=[output1, output2],
-        )
-
-        # Launch the Gradio app
-        demo.launch()
src/application/text/model_detection.py CHANGED

@@ -1,44 +1,24 @@
-
+"""
+Author: Khanh Phan
+Date: 2024-12-04
+"""

-import
-from
-from openai import (
-    AzureOpenAI,
-    OpenAIError,
-)
-from sentence_transformers import (
-    SentenceTransformer,
-    util,
-)
+from openai import OpenAIError
+from sentence_transformers import util
 from transformers import pipeline

-
-
-
-
-
-
-
-
-
+from src.application.config import (
+    AI_TEXT_DECTECTION_MODEL,
+    AZUREOPENAI_CLIENT,
+    DEVICE,
+    GPT_PARAPHRASE_MODELS,
+    HUMAN,
+    MACHINE,
+    MODEL_HUMAN_LABEL,
+    PARAPHRASE_MODEL,
+    UNKNOWN,
 )

-# TODO: move to a config file
-# AI_TEXT_DECTECTION_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
-AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
-
-MODEL_HUMAN_LABEL = {AI_TEXT_DECTECTION_MODEL: "Human"}
-HUMAN = "HUMAN"
-MACHINE = "MACHINE"
-UNKNOWN = "UNKNOWN"
-PARAPHRASE = "PARAPHRASE"
-NON_PARAPHRASE = "NON_PARAPHRASE"
-
-# load the embedding model
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
-PARAPHASE_MODEL.to(DEVICE)
-

 def detect_text_by_ai_model(
     input_text: str,
@@ -51,29 +31,43 @@ def detect_text_by_ai_model(

     Detects if text is human or machine generated.

+    Args:
+        input_text (str): The text to be classified.
+        model (str, optional): The name of the AI text detection model.
+        max_length (int, optional): The maximum length of the input text.
+
     Returns:
         tuple: (label, confidence_score)
             where label is HUMAN or MACHINE.
     """
     try:
+        # Create a text classification pipeline using the specified model.
         pipe = pipeline(
             "text-classification",
             model=model,
             tokenizer=model,
-            max_length=max_length,
+            max_length=max_length,  # TODO: consider: removal
             truncation=True,
             device_map="auto",  # good for GPU usage
         )
+
+        # Replace HTML line breaks with spaces to improve processing.
         input_text = input_text.replace("<br>", " ")
+
+        # Perform text classification using the pipeline.
         result = pipe(input_text)[0]
         confidence_score = result["score"]
+
+        # Determine the label based on the model's prediction.
         if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
         else:
            label = MACHINE
            generated_model, _ = predict_generation_model(input_text)
            label += f"<br>({generated_model})"
+
        return label, confidence_score
+
     except Exception as e:  # Add exception handling
         print(f"Error in Roberta model inference: {e}")
         return UNKNOWN, 0.5  # Return UNKNOWN and 0.0 confidence if error
@@ -82,20 +76,31 @@ def detect_text_by_ai_model(
 def predict_generation_model(text: str) -> tuple[str, float]:
     """
     Predicts if text is generated by gpt-4o or gpt-4o-mini models.
-
+    Compares the input text against the paraphrased text by the models.
+
+    Args:
+        text (str): The input text to be analyzed.

     Returns:
         tuple: (label, confidence_score)
-
+            where label is gpt-4o or gpt-4o-mini,
+            and confidence_score is the highest similarity.
     """
     best_similarity = 0
-    best_model =
-
-    for model in
+    best_model = GPT_PARAPHRASE_MODELS[0]
+
+    for model in GPT_PARAPHRASE_MODELS:
+        # Generate paraphrased text using the current model.
         paraphrased_text = paraphrase_by_AI(text, model)
+
+        # Skip to the next model if paraphrasing fails (returns None).
         if paraphrased_text is None:
             continue
+
+        # Similarity between the original text and the paraphrased text.
         similarity = measure_text_similarity(text, paraphrased_text)
+
+        # Update the best similarity
         if similarity > best_similarity:
             best_similarity = similarity
             best_model = model
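The comparison behind predict_generation_model above reduces to sentence-embedding cosine similarity. A standalone sketch, assuming the paraphrase-MiniLM-L6-v2 model used by this repo (downloaded on first run):

    # sketch: cosine similarity between two texts with sentence-transformers
    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

    def text_similarity(text1: str, text2: str) -> float:
        emb1 = model.encode(text1, convert_to_tensor=True)
        emb2 = model.encode(text2, convert_to_tensor=True)
        return float(util.cos_sim(emb1, emb2)[0][0])

    print(text_similarity("The cat sat on the mat.", "A cat was sitting on the mat."))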
1 |
+
"""
|
2 |
+
Author: Khanh Phan
|
3 |
+
Date: 2024-12-04
|
4 |
+
"""
|
5 |
|
6 |
+
from openai import OpenAIError
|
7 |
+
from sentence_transformers import util
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
from transformers import pipeline
|
9 |
|
10 |
+
from src.application.config import (
|
11 |
+
AI_TEXT_DECTECTION_MODEL,
|
12 |
+
AZUREOPENAI_CLIENT,
|
13 |
+
DEVICE,
|
14 |
+
GPT_PARAPHRASE_MODELS,
|
15 |
+
HUMAN,
|
16 |
+
MACHINE,
|
17 |
+
MODEL_HUMAN_LABEL,
|
18 |
+
PARAPHRASE_MODEL,
|
19 |
+
UNKNOWN,
|
20 |
)
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
def detect_text_by_ai_model(
|
24 |
input_text: str,
|
|
|
31 |
|
32 |
Detects if text is human or machine generated.
|
33 |
|
34 |
+
Args:
|
35 |
+
input_text (str): The text to be classified.
|
36 |
+
model (str, optional): The name of the AI text detection model.
|
37 |
+
max_length (int, optional): The maximum length of the input text.
|
38 |
+
|
39 |
Returns:
|
40 |
tuple: (label, confidence_score)
|
41 |
where label is HUMAN or MACHINE.
|
42 |
"""
|
43 |
try:
|
44 |
+
# Create a text classification pipeline using the specified model.
|
45 |
pipe = pipeline(
|
46 |
"text-classification",
|
47 |
model=model,
|
48 |
tokenizer=model,
|
49 |
+
max_length=max_length, # TODO: consider: removal
|
50 |
truncation=True,
|
51 |
device_map="auto", # good for GPU usage
|
52 |
)
|
53 |
+
|
54 |
+
# Replace HTML line breaks with spaces to improve processing.
|
55 |
input_text = input_text.replace("<br>", " ")
|
56 |
+
|
57 |
+
# Perform text classification using the pipeline.
|
58 |
result = pipe(input_text)[0]
|
59 |
confidence_score = result["score"]
|
60 |
+
|
61 |
+
# Determine the label based on the model's prediction.
|
62 |
if result["label"] == MODEL_HUMAN_LABEL[model]:
|
63 |
label = HUMAN
|
64 |
else:
|
65 |
label = MACHINE
|
66 |
generated_model, _ = predict_generation_model(input_text)
|
67 |
label += f"<br>({generated_model})"
|
68 |
+
|
69 |
return label, confidence_score
|
70 |
+
|
71 |
except Exception as e: # Add exception handling
|
72 |
print(f"Error in Roberta model inference: {e}")
|
73 |
return UNKNOWN, 0.5 # Return UNKNOWN and 0.5 confidence if error
|
|
|
76 |
def predict_generation_model(text: str) -> tuple[str, float]:
|
77 |
"""
|
78 |
Predicts if text is generated by gpt-4o or gpt-4o-mini models.
|
79 |
+
Compares the input text against the paraphrased text by the models.
|
80 |
+
|
81 |
+
Args:
|
82 |
+
text (str): The input text to be analyzed.
|
83 |
|
84 |
Returns:
|
85 |
tuple: (label, confidence_score)
|
86 |
+
where label is gpt-4o or gpt-4o-mini,
|
87 |
+
and confidence_score is the highest similarity.
|
88 |
"""
|
89 |
best_similarity = 0
|
90 |
+
best_model = GPT_PARAPHRASE_MODELS[0]
|
91 |
+
|
92 |
+
for model in GPT_PARAPHRASE_MODELS:
|
93 |
+
# Generate paraphrased text using the current model.
|
94 |
paraphrased_text = paraphrase_by_AI(text, model)
|
95 |
+
|
96 |
+
# Skip to the next model if paraphrasing fails (returns None).
|
97 |
if paraphrased_text is None:
|
98 |
continue
|
99 |
+
|
100 |
+
# Similarity between the original text and the paraphrased text.
|
101 |
similarity = measure_text_similarity(text, paraphrased_text)
|
102 |
+
|
103 |
+
# Update the best similarity
|
104 |
if similarity > best_similarity:
|
105 |
best_similarity = similarity
|
106 |
best_model = model
|
|
|
110 |
|
111 |
def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str:
|
112 |
"""
|
113 |
+
Paraphrases text using a given AI model.
|
114 |
+
|
115 |
+
Args:
|
116 |
+
input_text (str): The text to be paraphrased.
|
117 |
+
model (str, optional): The AI model to use for paraphrasing.
|
118 |
|
119 |
Returns:
|
120 |
+
str: The paraphrased text, or None if an error occurs.
|
121 |
"""
|
122 |
|
123 |
prompt = f"""
|
|
|
125 |
{input_text}
|
126 |
"""
|
127 |
try:
|
128 |
+
response = AZUREOPENAI_CLIENT.chat.completions.create(
|
129 |
model=model,
|
130 |
messages=[
|
131 |
{"role": "user", "content": prompt},
|
132 |
],
|
133 |
+
# max_tokens=100, # Limit the number of tokens in the response.
|
134 |
+
# temperature=0.7, # Control the randomness of the response.
|
135 |
+
# top_p=0.9, # Control the nucleus sampling.
|
136 |
+
# n=1, # Generate multiple responses.
|
137 |
)
|
138 |
paraphrased_text = response.choices[0].message.content
|
139 |
return paraphrased_text
|
140 |
+
|
141 |
except OpenAIError as e: # Add exception handling
|
142 |
print(f"Error in AI model inference: {e}")
|
143 |
return None
|
|
|
145 |
|
146 |
def measure_text_similarity(text1: str, text2: str) -> float:
|
147 |
"""
|
148 |
+
Measures the similarity between two texts
|
149 |
+
using cosine similarity of their sentence embeddings.
|
150 |
+
|
151 |
+
Args:
|
152 |
+
text1 (str): The first text string.
|
153 |
+
text2 (str): The second text string.
|
154 |
|
155 |
Returns:
|
156 |
+
float: The cosine similarity score between the two texts.
|
157 |
"""
|
158 |
+
# Generate sentence embeddings
|
159 |
+
embeddings1 = PARAPHRASE_MODEL.encode(
|
160 |
text1,
|
161 |
convert_to_tensor=True,
|
162 |
device=DEVICE,
|
163 |
show_progress_bar=False,
|
164 |
)
|
165 |
+
embeddings2 = PARAPHRASE_MODEL.encode(
|
166 |
text2,
|
167 |
convert_to_tensor=True,
|
168 |
device=DEVICE,
|
|
|
171 |
|
172 |
# Compute cosine similarity matrix
|
173 |
similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
|
|
|
174 |
return similarity[0][0]
|
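Note: measure_text_similarity above boils down to encoding both texts with a sentence-transformers model and taking the cosine similarity of the embeddings. A minimal standalone sketch of that idea (the paraphrase-MiniLM-L6-v2 checkpoint matches the constant removed from search_detection.py further down; the example strings are invented, and the module-level model is only a stand-in for PARAPHRASE_MODEL in config.py):

from sentence_transformers import SentenceTransformer, util

# Standalone stand-in for PARAPHRASE_MODEL from src/application/config.py (assumed setup).
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

def cosine_similarity(text1: str, text2: str) -> float:
    # Encode both texts and compare their embeddings, as measure_text_similarity does.
    emb1 = model.encode(text1, convert_to_tensor=True, show_progress_bar=False)
    emb2 = model.encode(text2, convert_to_tensor=True, show_progress_bar=False)
    return float(util.cos_sim(emb1, emb2)[0][0])

# A paraphrase pair should score close to 1.0 (invented example).
print(cosine_similarity("The cat sat on the mat.", "A cat was sitting on the mat."))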
src/application/text/preprocessing.py
CHANGED
@@ -1,46 +1,67 @@
|
1 |
from nltk.tokenize import sent_tokenize
|
2 |
|
3 |
|
4 |
-
|
|
|
5 |
"""
|
6 |
-
Splits input text into sentences by newlines
|
|
|
7 |
|
8 |
Args:
|
9 |
-
input_text: The input text as a string.
|
10 |
|
11 |
Returns:
|
12 |
-
A list of sentences.
|
|
|
13 |
"""
|
14 |
if not isinstance(input_text, str):
|
15 |
return []
|
16 |
|
|
|
|
|
17 |
paragraphs = input_text.splitlines(keepends=True)
|
18 |
sentences = []
|
19 |
for paragraph in paragraphs:
|
|
|
20 |
paragraph = paragraph.strip()
|
|
|
21 |
if paragraph and paragraph != "\n":
|
|
|
22 |
sentences.extend(sent_tokenize(paragraph))
|
|
|
23 |
return sentences
|
24 |
|
25 |
|
26 |
-
def split_into_paragraphs(input_text):
|
27 |
"""
|
28 |
-
Splits input text into
|
29 |
|
30 |
Args:
|
31 |
-
input_text: The input text as a string.
|
32 |
|
33 |
Returns:
|
34 |
-
A list of
|
|
|
35 |
"""
|
36 |
if not isinstance(input_text, str):
|
37 |
return []
|
38 |
|
|
|
|
|
39 |
paragraphs = input_text.splitlines(keepends=True)
|
40 |
out_paragraphs = []
|
|
|
41 |
for paragraph in paragraphs:
|
|
|
42 |
paragraph = paragraph.strip()
|
|
|
43 |
if paragraph and paragraph != "\n":
|
|
|
44 |
out_paragraphs.append(paragraph)
|
45 |
-
|
46 |
-
return out_paragraphs
|
|
|
1 |
+
"""
|
2 |
+
Author: Khanh Phan
|
3 |
+
Date: 2024-12-04
|
4 |
+
"""
|
5 |
+
|
6 |
from nltk.tokenize import sent_tokenize
|
7 |
|
8 |
|
9 |
+
# TODO: consider moving to helpers
|
10 |
+
def split_into_sentences(input_text: str) -> list[str]:
|
11 |
"""
|
12 |
+
Splits input text into sentences by newlines
|
13 |
+
and then tokenizes each paragraph into sentences.
|
14 |
|
15 |
Args:
|
16 |
+
input_text (str): The input text as a string.
|
17 |
|
18 |
Returns:
|
19 |
+
list: A list of sentences.
|
20 |
+
Returns an empty list if input is not a string.
|
21 |
"""
|
22 |
if not isinstance(input_text, str):
|
23 |
return []
|
24 |
|
25 |
+
# Split the input text into paragraphs based on newline characters,
|
26 |
+
# keeping the newline characters.
|
27 |
paragraphs = input_text.splitlines(keepends=True)
|
28 |
sentences = []
|
29 |
for paragraph in paragraphs:
|
30 |
+
# Remove leading/trailing whitespace
|
31 |
paragraph = paragraph.strip()
|
32 |
+
|
33 |
if paragraph and paragraph != "\n":
|
34 |
+
# Tokenize the paragraph into sentences
|
35 |
sentences.extend(sent_tokenize(paragraph))
|
36 |
+
|
37 |
return sentences
|
38 |
|
39 |
|
40 |
+
def split_into_paragraphs(input_text: str) -> list[str]:
|
41 |
"""
|
42 |
+
Splits input text into paragraphs based on newline characters.
|
43 |
|
44 |
Args:
|
45 |
+
input_text (str): The input text as a string.
|
46 |
|
47 |
Returns:
|
48 |
+
list: A list of paragraphs.
|
49 |
+
Returns an empty list if input is not a string.
|
50 |
"""
|
51 |
if not isinstance(input_text, str):
|
52 |
return []
|
53 |
|
54 |
+
# Split the input text into paragraphs based on newline characters,
|
55 |
+
# keeping the newline characters.
|
56 |
paragraphs = input_text.splitlines(keepends=True)
|
57 |
out_paragraphs = []
|
58 |
+
|
59 |
for paragraph in paragraphs:
|
60 |
+
# Remove leading/trailing whitespace
|
61 |
paragraph = paragraph.strip()
|
62 |
+
|
63 |
if paragraph and paragraph != "\n":
|
64 |
+
# Append the cleaned paragraph to the output list.
|
65 |
out_paragraphs.append(paragraph)
|
66 |
+
|
67 |
+
return out_paragraphs
|
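For reference, split_into_sentences first splits on newlines and then runs NLTK's sent_tokenize on each paragraph. A small standalone sketch (sample text invented; the punkt/punkt_tab downloads mirror the calls removed from search_detection.py below):

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

text = "First paragraph. It has two sentences.\nSecond paragraph on its own line."
sentences = []
for paragraph in text.splitlines(keepends=True):
    # Mirror split_into_sentences: strip each paragraph, skip empties, then tokenize.
    paragraph = paragraph.strip()
    if paragraph and paragraph != "\n":
        sentences.extend(sent_tokenize(paragraph))
print(sentences)
# ['First paragraph.', 'It has two sentences.', 'Second paragraph on its own line.']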
src/application/text/search.py
CHANGED
@@ -1,38 +1,50 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
2 |
import string
|
3 |
from collections import Counter
|
4 |
|
5 |
import requests
|
6 |
-
from dotenv import load_dotenv
|
7 |
from nltk.corpus import stopwords
|
8 |
from nltk.tokenize import word_tokenize
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
10 |
|
11 |
from src.application.text.entity import extract_entities
|
12 |
|
13 |
-
load_dotenv()
|
14 |
-
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
15 |
-
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
|
16 |
-
|
17 |
|
18 |
def search_by_google(
|
19 |
query,
|
20 |
-
num_results=
|
21 |
is_exact_terms=False,
|
22 |
) -> dict:
|
23 |
"""
|
24 |
-
|
25 |
|
26 |
Args:
|
27 |
-
query: The search query.
|
28 |
-
|
29 |
-
|
|
|
|
|
30 |
|
31 |
Returns:
|
32 |
-
|
|
|
33 |
"""
|
34 |
|
35 |
-
url = "https://www.googleapis.com/customsearch/v1"
|
36 |
params = {
|
37 |
"key": GOOGLE_API_KEY,
|
38 |
"cx": SEARCH_ENGINE_ID,
|
@@ -43,7 +55,7 @@ def search_by_google(
|
|
43 |
else:
|
44 |
params["q"] = query.replace('"', "")
|
45 |
|
46 |
-
response = requests.get(
|
47 |
if response.status_code == 200:
|
48 |
return response.json()
|
49 |
else:
|
@@ -51,26 +63,35 @@ def search_by_google(
|
|
51 |
return None
|
52 |
|
53 |
|
54 |
-
def get_most_frequent_words(
|
|
|
|
|
|
|
55 |
"""
|
56 |
-
|
57 |
-
|
58 |
|
59 |
Args:
|
60 |
-
input_text: The
|
61 |
-
number_word: The number of
|
62 |
|
63 |
Returns:
|
64 |
-
A
|
65 |
-
Returns an empty list if input is not a string or is empty.
|
66 |
"""
|
|
|
67 |
if not isinstance(input_text, str) or not input_text:
|
68 |
-
return
|
|
|
|
|
|
|
69 |
|
70 |
-
words
|
|
|
71 |
|
72 |
-
|
73 |
-
punctuation = set(string.punctuation)
|
|
|
|
|
74 |
filtered_words = [
|
75 |
word
|
76 |
for word in words
|
@@ -78,32 +99,40 @@ def get_most_frequent_words(input_text, number_word=32):
|
|
78 |
and word not in stop_words
|
79 |
and word not in punctuation
|
80 |
]
|
|
|
|
|
81 |
word_frequencies = Counter(filtered_words)
|
|
|
|
|
82 |
top_words = word_frequencies.most_common(number_word)
|
83 |
|
84 |
for top_word in top_words:
|
85 |
words.append(top_word[0])
|
86 |
|
87 |
-
|
88 |
-
|
|
|
89 |
else:
|
90 |
search_phrase = " ".join(words[:number_word])
|
91 |
|
92 |
return search_phrase
|
93 |
|
94 |
|
95 |
-
def get_chunk(
|
|
|
|
|
|
|
|
|
96 |
"""
|
97 |
-
Splits the input text into chunks of a specified
|
98 |
|
99 |
Args:
|
100 |
-
input_text: The
|
101 |
-
|
102 |
-
|
103 |
|
104 |
Returns:
|
105 |
-
A list of
|
106 |
-
Returns an empty list if input is invalid.
|
107 |
"""
|
108 |
if not isinstance(input_text, str):
|
109 |
return []
|
@@ -112,8 +141,11 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
|
|
112 |
input_words = input_text.split() # Split by any whitespace
|
113 |
|
114 |
for i in range(num_chunk):
|
115 |
-
|
116 |
-
|
|
|
|
|
|
|
117 |
chunk = " ".join(input_words[start_index:end_index])
|
118 |
if chunk: # Only append non-empty chunks
|
119 |
chunks.append(chunk)
|
@@ -121,11 +153,20 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
|
|
121 |
return chunks
|
122 |
|
123 |
|
124 |
-
def get_keywords(text, num_keywords=
|
125 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
126 |
|
|
|
|
|
|
|
|
|
127 |
# Create a TF-IDF Vectorizer
|
128 |
-
vectorizer = TfidfVectorizer(stop_words=
|
129 |
|
130 |
# Fit and transform the text
|
131 |
tfidf_matrix = vectorizer.fit_transform([text])
|
@@ -144,7 +185,7 @@ def get_keywords(text, num_keywords=5):
|
|
144 |
return [word for word, score in word_scores[:num_keywords]]
|
145 |
|
146 |
|
147 |
-
def generate_search_phrases(input_text):
|
148 |
"""
|
149 |
Generates different types of phrases for search purposes.
|
150 |
|
@@ -156,6 +197,7 @@ def generate_search_phrases(input_text):
|
|
156 |
- A list of most frequent words.
|
157 |
- The original input text.
|
158 |
- A list of text chunks.
|
|
|
159 |
"""
|
160 |
if not isinstance(input_text, str):
|
161 |
return []
|
@@ -171,7 +213,7 @@ def generate_search_phrases(input_text):
|
|
171 |
# Method 3: Split text by chunks
|
172 |
search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
|
173 |
|
174 |
-
# Method 4:
|
175 |
entities = extract_entities(input_text)
|
176 |
text_without_entities = remove_identities_from_text(input_text, entities)
|
177 |
search_phrases.append(text_without_entities)
|
@@ -182,7 +224,7 @@ def generate_search_phrases(input_text):
|
|
182 |
return search_phrases
|
183 |
|
184 |
|
185 |
-
def remove_identities_from_text(input_text, entities):
|
186 |
"""
|
187 |
Removes entities from the input text.
|
188 |
|
|
|
1 |
+
"""
|
2 |
+
Author: Khanh Phan
|
3 |
+
Date: 2024-12-04
|
4 |
+
"""
|
5 |
+
|
6 |
import string
|
7 |
from collections import Counter
|
8 |
|
9 |
import requests
|
|
|
10 |
from nltk.corpus import stopwords
|
11 |
from nltk.tokenize import word_tokenize
|
12 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
13 |
|
14 |
+
from src.application.config import (
|
15 |
+
CHUNK_SIZE,
|
16 |
+
GOOGLE_API_KEY,
|
17 |
+
GOOGLE_ENDPOINT_URL,
|
18 |
+
NUM_CHUNKS,
|
19 |
+
NUM_FREQUENT_WORDS,
|
20 |
+
NUM_KEYWORDS,
|
21 |
+
SEARCH_ENGINE_ID,
|
22 |
+
STOPWORDS_LANG,
|
23 |
+
TOP_SEARCH_RESUTLS,
|
24 |
+
)
|
25 |
from src.application.text.entity import extract_entities
|
26 |
|
27 |
|
28 |
def search_by_google(
|
29 |
query,
|
30 |
+
num_results=TOP_SEARCH_RESUTLS,
|
31 |
is_exact_terms=False,
|
32 |
) -> dict:
|
33 |
"""
|
34 |
+
Performs a Google Custom Search API query.
|
35 |
|
36 |
Args:
|
37 |
+
query (str): The search query string.
|
38 |
+
num_results (int, optional): The number of search results to return.
|
39 |
+
Defaults to TOP_SEARCH_RESUTLS.
|
40 |
+
is_exact_terms (bool, optional): use an exact phrase search or not.
|
41 |
+
Defaults to False.
|
42 |
|
43 |
Returns:
|
44 |
+
dict: JSON response from the Google Custom Search API,
|
45 |
+
None if an error occurs.
|
46 |
"""
|
47 |
|
|
|
48 |
params = {
|
49 |
"key": GOOGLE_API_KEY,
|
50 |
"cx": SEARCH_ENGINE_ID,
|
|
|
55 |
else:
|
56 |
params["q"] = query.replace('"', "")
|
57 |
|
58 |
+
response = requests.get(GOOGLE_ENDPOINT_URL, params=params)
|
59 |
if response.status_code == 200:
|
60 |
return response.json()
|
61 |
else:
|
|
|
63 |
return None
|
64 |
|
65 |
|
66 |
+
def get_most_frequent_words(
|
67 |
+
input_text: str,
|
68 |
+
number_word: int = NUM_FREQUENT_WORDS,
|
69 |
+
) -> str:
|
70 |
"""
|
71 |
+
Extracts the most frequent words from the input text
|
72 |
+
and forms a search phrase.
|
73 |
|
74 |
Args:
|
75 |
+
input_text (str): The text from which to extract frequent words.
|
76 |
+
number_word (int, optional): The number of frequent words to extract.
|
77 |
|
78 |
Returns:
|
79 |
+
str: A search phrase consisting of the most frequent words.
|
|
|
80 |
"""
|
81 |
+
# Check if the input text is valid
|
82 |
if not isinstance(input_text, str) or not input_text:
|
83 |
+
return None
|
84 |
+
|
85 |
+
# Tokenize the input text into words and convert to lowercase
|
86 |
+
words = word_tokenize(input_text.lower())
|
87 |
|
88 |
+
# Get the set of stop words for the specified language
|
89 |
+
stop_words = set(stopwords.words(STOPWORDS_LANG))
|
90 |
|
91 |
+
# Get the set of punctuation characters
|
92 |
+
punctuation = set(string.punctuation)
|
93 |
+
|
94 |
+
# Filter out stop words, punctuation, and non-alphanumeric words
|
95 |
filtered_words = [
|
96 |
word
|
97 |
for word in words
|
|
|
99 |
and word not in stop_words
|
100 |
and word not in punctuation
|
101 |
]
|
102 |
+
|
103 |
+
# Count the frequency of each filtered word
|
104 |
word_frequencies = Counter(filtered_words)
|
105 |
+
|
106 |
+
# Get the most common words and their frequencies
|
107 |
top_words = word_frequencies.most_common(number_word)
|
108 |
|
109 |
for top_word in top_words:
|
110 |
words.append(top_word[0])
|
111 |
|
112 |
+
# Construct the search phrase
|
113 |
+
if len(words) > NUM_FREQUENT_WORDS:
|
114 |
+
search_phrase = " ".join(words[:NUM_FREQUENT_WORDS])
|
115 |
else:
|
116 |
search_phrase = " ".join(words[:number_word])
|
117 |
|
118 |
return search_phrase
|
119 |
|
120 |
|
121 |
+
def get_chunk(
|
122 |
+
input_text: str,
|
123 |
+
chunk_size: int = CHUNK_SIZE,
|
124 |
+
num_chunk: int = NUM_CHUNKS,
|
125 |
+
) -> list[str]:
|
126 |
"""
|
127 |
+
Splits the input text into chunks of a specified size.
|
128 |
|
129 |
Args:
|
130 |
+
input_text (str): The text to be chunked.
|
131 |
+
chunk_size (int, optional): The number of words per chunk.
|
132 |
+
num_chunk (int, optional): The number of chunks to generate.
|
133 |
|
134 |
Returns:
|
135 |
+
list: A list of chunks of the input text.
|
|
|
136 |
"""
|
137 |
if not isinstance(input_text, str):
|
138 |
return []
|
|
|
141 |
input_words = input_text.split() # Split by any whitespace
|
142 |
|
143 |
for i in range(num_chunk):
|
144 |
+
# Calculate the start and end indices for the current chunk
|
145 |
+
start_index = i * chunk_size
|
146 |
+
end_index = (i + 1) * chunk_size
|
147 |
+
|
148 |
+
# Extract the words for the current chunk and join them into a string
|
149 |
chunk = " ".join(input_words[start_index:end_index])
|
150 |
if chunk: # Only append non-empty chunks
|
151 |
chunks.append(chunk)
|
|
|
153 |
return chunks
|
154 |
|
155 |
|
156 |
+
def get_keywords(text: str, num_keywords: int = NUM_KEYWORDS) -> list[str]:
|
157 |
+
"""
|
158 |
+
Extracts the top keywords from a given text using the TF-IDF method.
|
159 |
+
|
160 |
+
Args:
|
161 |
+
text (str): The input text from which to extract keywords.
|
162 |
+
num_keywords (int, optional): The number of top keywords to return.
|
163 |
|
164 |
+
Returns:
|
165 |
+
list: A list of strings representing the top keywords extracted
|
166 |
+
from the text.
|
167 |
+
"""
|
168 |
# Create a TF-IDF Vectorizer
|
169 |
+
vectorizer = TfidfVectorizer(stop_words=STOPWORDS_LANG)
|
170 |
|
171 |
# Fit and transform the text
|
172 |
tfidf_matrix = vectorizer.fit_transform([text])
|
|
|
185 |
return [word for word, score in word_scores[:num_keywords]]
|
186 |
|
187 |
|
188 |
+
def generate_search_phrases(input_text: str) -> list[str]:
|
189 |
"""
|
190 |
Generates different types of phrases for search purposes.
|
191 |
|
|
|
197 |
- A list of most frequent words.
|
198 |
- The original input text.
|
199 |
- A list of text chunks.
|
200 |
+
- A text without entities.
|
201 |
"""
|
202 |
if not isinstance(input_text, str):
|
203 |
return []
|
|
|
213 |
# Method 3: Split text by chunks
|
214 |
search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
|
215 |
|
216 |
+
# Method 4: Remove identities and key words
|
217 |
entities = extract_entities(input_text)
|
218 |
text_without_entities = remove_identities_from_text(input_text, entities)
|
219 |
search_phrases.append(text_without_entities)
|
|
|
224 |
return search_phrases
|
225 |
|
226 |
|
227 |
+
def remove_identities_from_text(input_text: str, entities: list[str]) -> str:
|
228 |
"""
|
229 |
Removes entities from the input text.
|
230 |
|
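The chunking and keyword helpers in search.py are easy to exercise on their own. A minimal sketch, assuming scikit-learn is installed; the sample text, chunk size, and chunk count below are invented (the real defaults come from CHUNK_SIZE, NUM_CHUNKS, and NUM_KEYWORDS in config.py):

from sklearn.feature_extraction.text import TfidfVectorizer

text = "Quantum computers use qubits to run certain algorithms faster than classical machines."
words = text.split()

# Fixed-size word windows, mirroring get_chunk (sizes are invented here).
chunk_size, num_chunk = 5, 3
chunks = [" ".join(words[i * chunk_size:(i + 1) * chunk_size]) for i in range(num_chunk)]
print([c for c in chunks if c])

# TF-IDF keyword ranking over a single document, mirroring get_keywords.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf = vectorizer.fit_transform([text])
scores = zip(vectorizer.get_feature_names_out(), tfidf.toarray()[0])
print([word for word, score in sorted(scores, key=lambda x: x[1], reverse=True)[:5]])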
src/application/text/search_detection.py
CHANGED
@@ -1,14 +1,22 @@
|
1 |
import warnings
|
2 |
-
from difflib import SequenceMatcher
|
3 |
|
4 |
-
import nltk
|
5 |
import numpy as np
|
6 |
-
import
|
7 |
-
from sentence_transformers import
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
)
|
11 |
-
|
12 |
from src.application.text.preprocessing import split_into_sentences
|
13 |
from src.application.text.search import (
|
14 |
generate_search_phrases,
|
@@ -18,39 +26,43 @@ from src.application.url_reader import URLReader
|
|
18 |
|
19 |
warnings.simplefilter(action="ignore", category=FutureWarning)
|
20 |
|
21 |
-
# Download necessary NLTK data files
|
22 |
-
nltk.download("punkt", quiet=True)
|
23 |
-
nltk.download("punkt_tab", quiet=True)
|
24 |
-
nltk.download("stopwords", quiet=True)
|
25 |
-
|
26 |
-
# load the model
|
27 |
-
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
28 |
-
PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
|
29 |
-
PARAPHASE_MODEL.to(DEVICE)
|
30 |
-
|
31 |
-
PARAPHRASE_THRESHOLD_HUMAN = 0.963
|
32 |
-
PARAPHRASE_THRESHOLD_MACHINE = 0.8
|
33 |
-
PARAPHRASE_THRESHOLD = 0.8
|
34 |
-
|
35 |
-
MIN_SAME_SENTENCE_LEN = 6
|
36 |
-
MIN_PHRASE_SENTENCE_LEN = 10
|
37 |
-
MIN_RATIO_PARAPHRASE_NUM = 0.5
|
38 |
-
MAX_CHAR_SIZE = 30000
|
39 |
|
40 |
|
41 |
-
|
|
|
|
|
|
|
42 |
|
43 |
-
|
44 |
searched_phrases = generate_search_phrases(text[text_index])
|
45 |
|
46 |
for candidate in searched_phrases:
|
|
|
47 |
search_results = search_by_google(candidate)
|
|
|
|
|
48 |
urls = [item["link"] for item in search_results.get("items", [])]
|
49 |
|
50 |
-
|
51 |
-
|
|
|
52 |
continue
|
53 |
-
if "bbc.com" not in url:
|
54 |
continue
|
55 |
|
56 |
checked_urls.add(url)
|
@@ -96,13 +108,13 @@ def find_sentence_source(text, text_index, sentences_df):
|
|
96 |
if c in sentences_df.columns:
|
97 |
sentences_df.loc[text_index, c] = aligned_sentence[c]
|
98 |
|
|
|
99 |
for idx, _ in sentences_df.iterrows():
|
100 |
similarity = sentences_df.loc[idx, "similarity"]
|
101 |
if similarity is not None:
|
102 |
if similarity > PARAPHRASE_THRESHOLD_MACHINE:
|
103 |
continue
|
104 |
|
105 |
-
# find matched content in new url
|
106 |
aligned_sentence = check_paraphrase(
|
107 |
text[idx],
|
108 |
source_text,
|
@@ -125,141 +137,56 @@ def find_sentence_source(text, text_index, sentences_df):
|
|
125 |
sentences_df.loc[idx, c] = aligned_sentence[c]
|
126 |
return sentences_df, content.images
|
127 |
|
|
|
128 |
sentences_df.loc[text_index, "input"] = text[text_index]
|
129 |
return sentences_df, []
|
130 |
|
131 |
|
132 |
-
def
|
133 |
-
"""
|
134 |
-
Finds the length of the longest common subsequence (contiguous) between
|
135 |
-
two arrays.
|
136 |
-
|
137 |
-
Args:
|
138 |
-
arr1: The first array.
|
139 |
-
arr2: The second array.
|
140 |
-
|
141 |
-
Returns:
|
142 |
-
The length of the longest common subsequence.
|
143 |
-
Returns 0 if either input is invalid.
|
144 |
-
"""
|
145 |
-
|
146 |
-
if not isinstance(arr1, list) or not isinstance(arr2, list):
|
147 |
-
return 0
|
148 |
-
|
149 |
-
n = len(arr1)
|
150 |
-
m = len(arr2)
|
151 |
-
|
152 |
-
if n == 0 or m == 0: # handle empty list
|
153 |
-
return 0
|
154 |
-
|
155 |
-
# Create table dp with size (n+1) x (m+1)
|
156 |
-
dp = [[0] * (m + 1) for _ in range(n + 1)]
|
157 |
-
max_length = 0
|
158 |
-
|
159 |
-
for i in range(1, n + 1):
|
160 |
-
for j in range(1, m + 1):
|
161 |
-
if arr1[i - 1] == arr2[j - 1]:
|
162 |
-
dp[i][j] = dp[i - 1][j - 1] + 1
|
163 |
-
max_length = max(max_length, dp[i][j])
|
164 |
-
else:
|
165 |
-
dp[i][j] = 0 # set 0 since the array must be consecutive
|
166 |
-
|
167 |
-
return max_length
|
168 |
-
|
169 |
-
|
170 |
-
def check_sentence(
|
171 |
-
input_sentence,
|
172 |
-
source_sentence,
|
173 |
-
min_same_sentence_len,
|
174 |
-
min_phrase_sentence_len,
|
175 |
-
verbose=False,
|
176 |
-
):
|
177 |
-
"""
|
178 |
-
Checks if two sentences are similar based on exact match or
|
179 |
-
longest common subsequence.
|
180 |
-
|
181 |
-
Args:
|
182 |
-
input_sentence: The input sentence.
|
183 |
-
source_sentence: The source sentence.
|
184 |
-
min_same_sentence_len: Minimum length for exact sentence match.
|
185 |
-
min_phrase_sentence_len: Minimum length for common subsequence match.
|
186 |
-
verbose: If True, print debug information.
|
187 |
-
|
188 |
-
Returns:
|
189 |
-
True if the sentences are considered similar, False otherwise.
|
190 |
-
Returns False if input is not valid.
|
191 |
-
"""
|
192 |
-
|
193 |
-
if not isinstance(input_sentence, str) or not isinstance(
|
194 |
-
source_sentence,
|
195 |
-
str,
|
196 |
-
):
|
197 |
-
return False
|
198 |
-
|
199 |
-
input_sentence = input_sentence.strip()
|
200 |
-
source_sentence = source_sentence.strip()
|
201 |
-
|
202 |
-
if not input_sentence or not source_sentence: # handle empty string
|
203 |
-
return False
|
204 |
-
|
205 |
-
input_words = input_sentence.split() # split without arguments
|
206 |
-
source_words = source_sentence.split() # split without arguments
|
207 |
-
|
208 |
-
if (
|
209 |
-
input_sentence == source_sentence
|
210 |
-
and len(input_words) >= min_same_sentence_len
|
211 |
-
):
|
212 |
-
if verbose:
|
213 |
-
print("Exact match found.")
|
214 |
-
return True
|
215 |
-
|
216 |
-
max_overlap_len = longest_common_subsequence(input_words, source_words)
|
217 |
-
if verbose:
|
218 |
-
print(f"Max overlap length: {max_overlap_len}") # print overlap length
|
219 |
-
if max_overlap_len >= min_phrase_sentence_len:
|
220 |
-
return True
|
221 |
-
|
222 |
-
return False
|
223 |
-
|
224 |
-
|
225 |
-
def check_paraphrase(input_text, source_text, url):
|
226 |
"""
|
227 |
-
Checks if the input text is
|
|
|
228 |
|
229 |
Args:
|
230 |
-
input_text: The text to
|
231 |
-
|
232 |
-
url
|
233 |
|
234 |
Returns:
|
235 |
-
A
|
236 |
-
|
|
|
|
|
|
|
|
|
|
|
237 |
"""
|
238 |
-
|
239 |
# Extract sentences from input text and web page
|
240 |
input_sentences = split_into_sentences(input_text)
|
241 |
|
242 |
if not source_text:
|
243 |
return {}
|
244 |
-
|
245 |
source_sentences = split_into_sentences(source_text)
|
|
|
246 |
if not input_sentences or not source_sentences:
|
247 |
return {}
|
248 |
|
|
|
|
|
249 |
additional_sentences = []
|
250 |
for sentence in source_sentences:
|
251 |
if ", external" in sentence:
|
252 |
additional_sentences.append(sentence.replace(", external", ""))
|
253 |
source_sentences.extend(additional_sentences)
|
254 |
|
255 |
-
# Encode sentences into embeddings
|
256 |
-
embeddings1 =
|
257 |
input_sentences,
|
258 |
convert_to_tensor=True,
|
259 |
device=DEVICE,
|
260 |
show_progress_bar=False,
|
261 |
)
|
262 |
-
embeddings2 =
|
263 |
source_sentences,
|
264 |
convert_to_tensor=True,
|
265 |
device=DEVICE,
|
@@ -272,78 +199,53 @@ def check_paraphrase(input_text, source_text, url):
|
|
272 |
# Find sentence alignments
|
273 |
inputs = ""
|
274 |
sources = ""
|
275 |
-
similarities = []
|
276 |
-
|
277 |
for i, sentence in enumerate(input_sentences):
|
278 |
max_sim_index = np.argmax(similarity_matrix[i])
|
279 |
max_similarity = similarity_matrix[i][max_sim_index]
|
280 |
best_matched_sentence = source_sentences[max_sim_index]
|
281 |
-
|
282 |
inputs += sentence + " "
|
283 |
sources += best_matched_sentence + " "
|
284 |
similarities.append(max_similarity)
|
285 |
|
286 |
-
|
287 |
similarity = sum(similarities) / len(similarities)
|
288 |
label, is_paraphrased = determine_label(max_similarity)
|
|
|
|
|
289 |
alignment = {
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
|
|
297 |
print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')
|
298 |
|
299 |
return alignment
|
300 |
|
301 |
|
302 |
-
def
|
303 |
"""
|
304 |
-
|
305 |
|
306 |
Args:
|
307 |
-
|
308 |
-
b: The second string.
|
309 |
-
|
310 |
-
Returns:
|
311 |
-
A float representing the similarity ratio between 0.0 and 1.0.
|
312 |
-
Returns 0.0 if either input is None or not a string.
|
313 |
-
"""
|
314 |
-
if (
|
315 |
-
not isinstance(a, str)
|
316 |
-
or not isinstance(b, str)
|
317 |
-
or a is None
|
318 |
-
or b is None
|
319 |
-
):
|
320 |
-
return 0.0 # Handle cases where inputs are not strings or None
|
321 |
-
return SequenceMatcher(None, a, b).ratio()
|
322 |
-
|
323 |
-
|
324 |
-
def check_human(alligned_sentences):
|
325 |
-
"""
|
326 |
-
Checks if a sufficient number of input sentences are found within
|
327 |
-
source sentences.
|
328 |
|
329 |
Returns:
|
330 |
-
|
|
|
331 |
"""
|
332 |
-
if not alligned_sentences: # Handle empty data case
|
333 |
-
return False
|
334 |
-
|
335 |
-
if alligned_sentences["similarity"] >= 0.99:
|
336 |
-
return True
|
337 |
-
return False
|
338 |
-
|
339 |
-
|
340 |
-
def determine_label(similarity):
|
341 |
if similarity >= PARAPHRASE_THRESHOLD_HUMAN:
|
342 |
-
return "HUMAN", True
|
343 |
elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
|
344 |
-
return "MACHINE", True
|
345 |
else:
|
346 |
-
return None, False
|
347 |
|
348 |
|
349 |
if __name__ == "__main__":
|
|
|
1 |
+
"""
|
2 |
+
Author: Khanh Phan
|
3 |
+
Date: 2024-12-04
|
4 |
+
"""
|
5 |
+
|
6 |
import warnings
|
|
|
7 |
|
|
|
8 |
import numpy as np
|
9 |
+
from pandas import DataFrame
|
10 |
+
from sentence_transformers import util
|
11 |
+
|
12 |
+
from src.application.config import (
|
13 |
+
DEVICE,
|
14 |
+
MAX_CHAR_SIZE,
|
15 |
+
PARAPHRASE_MODEL,
|
16 |
+
PARAPHRASE_THRESHOLD_HUMAN,
|
17 |
+
PARAPHRASE_THRESHOLD_MACHINE,
|
18 |
+
TOP_URLS_PER_SEARCH,
|
19 |
)
|
|
|
20 |
from src.application.text.preprocessing import split_into_sentences
|
21 |
from src.application.text.search import (
|
22 |
generate_search_phrases,
|
|
|
26 |
|
27 |
warnings.simplefilter(action="ignore", category=FutureWarning)
|
28 |
|
29 |
|
30 |
+
def find_sentence_source(
|
31 |
+
text: list,
|
32 |
+
text_index: str,
|
33 |
+
sentences_df: DataFrame,
|
34 |
+
) -> tuple[DataFrame, list]:
|
35 |
+
"""
|
36 |
+
Finds the source URL for a given sentence by searching Google
|
37 |
+
and checking for paraphrases.
|
38 |
|
39 |
+
Args:
|
40 |
+
text (list): A list of sentences.
|
41 |
+
text_index (int): The index of the sentence to find the source for.
|
42 |
+
sentences_df (pd.DataFrame): A DF to store sentence information.
|
43 |
|
44 |
+
Returns:
|
45 |
+
tuple: A tuple of the updated sentences_df and a list of image URLs.
|
46 |
+
If a source is found, the DF is updated with source information.
|
47 |
+
If no source is found, the DF is updated with the original input.
|
48 |
+
"""
|
49 |
+
checked_urls = (
|
50 |
+
set()
|
51 |
+
) # Keep track of visited URLs to avoid redundant checks
|
52 |
searched_phrases = generate_search_phrases(text[text_index])
|
53 |
|
54 |
for candidate in searched_phrases:
|
55 |
+
# Search Google for the generated phrase
|
56 |
search_results = search_by_google(candidate)
|
57 |
+
|
58 |
+
# Extract URLs from search results
|
59 |
urls = [item["link"] for item in search_results.get("items", [])]
|
60 |
|
61 |
+
# Check only the top TOP_URLS_PER_SEARCH URLs from each search
|
62 |
+
for url in urls[:TOP_URLS_PER_SEARCH]:
|
63 |
+
if url in checked_urls: # Skip already checked URLs
|
64 |
continue
|
65 |
+
if "bbc.com" not in url: # TODO: remove when releasing
|
66 |
continue
|
67 |
|
68 |
checked_urls.add(url)
|
|
|
108 |
if c in sentences_df.columns:
|
109 |
sentences_df.loc[text_index, c] = aligned_sentence[c]
|
110 |
|
111 |
+
# Check other sentences for better matches in the same source
|
112 |
for idx, _ in sentences_df.iterrows():
|
113 |
similarity = sentences_df.loc[idx, "similarity"]
|
114 |
if similarity is not None:
|
115 |
if similarity > PARAPHRASE_THRESHOLD_MACHINE:
|
116 |
continue
|
117 |
|
|
|
118 |
aligned_sentence = check_paraphrase(
|
119 |
text[idx],
|
120 |
source_text,
|
|
|
137 |
sentences_df.loc[idx, c] = aligned_sentence[c]
|
138 |
return sentences_df, content.images
|
139 |
|
140 |
+
# If no source is found, update the DF with the original input
|
141 |
sentences_df.loc[text_index, "input"] = text[text_index]
|
142 |
return sentences_df, []
|
143 |
|
144 |
|
145 |
+
def check_paraphrase(input_text: str, source_text: str, url: str) -> dict:
|
146 |
"""
|
147 |
+
Checks if the input text is a paraphrase of the source text
|
148 |
+
by comparing sentence-level similarities.
|
149 |
|
150 |
Args:
|
151 |
+
input_text (str): The text to be checked for paraphrasing.
|
152 |
+
source_text (str): The source text to compare against.
|
153 |
+
url (str): The URL of the source text (for storing in the result).
|
154 |
|
155 |
Returns:
|
156 |
+
dict: A dictionary containing the alignment information, including:
|
157 |
+
- "input": Concatenated input sentences.
|
158 |
+
- "source": Concatenated best-matched source sentences.
|
159 |
+
- "similarity": Average cosine similarity score.
|
160 |
+
- "label": Label determined based on similarity.
|
161 |
+
- "paraphrase": Boolean indicating if it's a paraphrase.
|
162 |
+
- "url": The source URL.
|
163 |
"""
|
|
|
164 |
# Extract sentences from input text and web page
|
165 |
input_sentences = split_into_sentences(input_text)
|
166 |
|
167 |
if not source_text:
|
168 |
return {}
|
|
|
169 |
source_sentences = split_into_sentences(source_text)
|
170 |
+
|
171 |
if not input_sentences or not source_sentences:
|
172 |
return {}
|
173 |
|
174 |
+
# Handle external references in source sentences
|
175 |
+
# This is specified for bbc news articles
|
176 |
additional_sentences = []
|
177 |
for sentence in source_sentences:
|
178 |
if ", external" in sentence:
|
179 |
additional_sentences.append(sentence.replace(", external", ""))
|
180 |
source_sentences.extend(additional_sentences)
|
181 |
|
182 |
+
# Encode sentences into embeddings using the PARAPHRASE_MODEL
|
183 |
+
embeddings1 = PARAPHRASE_MODEL.encode(
|
184 |
input_sentences,
|
185 |
convert_to_tensor=True,
|
186 |
device=DEVICE,
|
187 |
show_progress_bar=False,
|
188 |
)
|
189 |
+
embeddings2 = PARAPHRASE_MODEL.encode(
|
190 |
source_sentences,
|
191 |
convert_to_tensor=True,
|
192 |
device=DEVICE,
|
|
|
199 |
# Find sentence alignments
|
200 |
inputs = ""
|
201 |
sources = ""
|
202 |
+
similarities = []
|
203 |
+
|
204 |
for i, sentence in enumerate(input_sentences):
|
205 |
max_sim_index = np.argmax(similarity_matrix[i])
|
206 |
max_similarity = similarity_matrix[i][max_sim_index]
|
207 |
best_matched_sentence = source_sentences[max_sim_index]
|
208 |
+
|
209 |
inputs += sentence + " "
|
210 |
sources += best_matched_sentence + " "
|
211 |
similarities.append(max_similarity)
|
212 |
|
213 |
+
# Calculate average similarity and determine paraphrase label
|
214 |
similarity = sum(similarities) / len(similarities)
|
215 |
label, is_paraphrased = determine_label(max_similarity)
|
216 |
+
|
217 |
+
# Create the alignment dictionary
|
218 |
alignment = {
|
219 |
+
"input": inputs,
|
220 |
+
"source": sources,
|
221 |
+
"similarity": similarity,
|
222 |
+
"label": label,
|
223 |
+
"paraphrase": is_paraphrased,
|
224 |
+
"url": url,
|
225 |
+
}
|
226 |
+
|
227 |
print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')
|
228 |
|
229 |
return alignment
|
230 |
|
231 |
|
232 |
+
def determine_label(similarity: float) -> tuple[str | None, bool]:
|
233 |
"""
|
234 |
+
Determines a label and paraphrase status based on the similarity score.
|
235 |
|
236 |
Args:
|
237 |
+
similarity (float): The similarity score between two texts.
|
238 |
|
239 |
Returns:
|
240 |
+
tuple: A tuple containing the label (str or None)
|
241 |
+
and a boolean indicating if it's a paraphrase.
|
242 |
"""
|
|
|
243 |
if similarity >= PARAPHRASE_THRESHOLD_HUMAN:
|
244 |
+
return "HUMAN", True # Human paraphrase
|
245 |
elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
|
246 |
+
return "MACHINE", True # Machine paraphrase
|
247 |
else:
|
248 |
+
return None, False # Not a paraphrase
|
249 |
|
250 |
|
251 |
if __name__ == "__main__":
|
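check_paraphrase aligns each input sentence with its most similar source sentence, averages the cosine scores, and thresholds the result via determine_label. A standalone sketch of that flow (sentences invented; the 0.963 / 0.8 thresholds are the PARAPHRASE_THRESHOLD_HUMAN / PARAPHRASE_THRESHOLD_MACHINE values removed from this file, so the values now in config.py may differ):

import numpy as np
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

input_sentences = ["The mayor announced a new budget.", "Schools will receive extra funding."]
source_sentences = [
    "A new budget was announced by the mayor.",
    "Extra funding goes to schools.",
    "The weather stays mild this week.",
]

emb_in = model.encode(input_sentences, convert_to_tensor=True, show_progress_bar=False)
emb_src = model.encode(source_sentences, convert_to_tensor=True, show_progress_bar=False)
sim = util.cos_sim(emb_in, emb_src).cpu().numpy()

# Align each input sentence with its best-matching source sentence and average the scores.
best = sim.argmax(axis=1)
avg = float(np.mean([sim[i][j] for i, j in enumerate(best)]))

# Threshold into HUMAN / MACHINE / None, as determine_label does.
label = "HUMAN" if avg >= 0.963 else "MACHINE" if avg >= 0.8 else None
print(avg, label)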
test.py
CHANGED
@@ -1,74 +1,3 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
"""
|
5 |
-
Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
|
6 |
-
|
7 |
-
Args:
|
8 |
-
text1: The text to search for.
|
9 |
-
text2: The text to search within.
|
10 |
-
|
11 |
-
Returns:
|
12 |
-
A tuple: (True/False if text1 is found, True/False if next char is newline, or None if not found)
|
13 |
-
"""
|
14 |
-
|
15 |
-
match = re.search(re.escape(text1), text2) #escape text1 to handle special characters
|
16 |
-
|
17 |
-
if match:
|
18 |
-
# Find the next non-space character
|
19 |
-
next_char_index = match.end()
|
20 |
-
while next_char_index < len(text2) and text2[next_char_index].isspace():
|
21 |
-
next_char_index += 1
|
22 |
-
|
23 |
-
if text2[next_char_index:next_char_index+2] == r'\n':
|
24 |
-
print("newline found")
|
25 |
-
if next_char_index < len(text2) and text2[next_char_index:next_char_index+2] == r'\n':
|
26 |
-
return True
|
27 |
-
|
28 |
-
return False
|
29 |
-
|
30 |
-
def is_newline_after_text_2(text1, text2):
|
31 |
-
"""
|
32 |
-
Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
|
33 |
-
|
34 |
-
Args:
|
35 |
-
text1: The text to search for.
|
36 |
-
text2: The text to search within.
|
37 |
-
|
38 |
-
Returns:
|
39 |
-
True if next char is newline
|
40 |
-
"""
|
41 |
-
text2 = text2.replace("\n", "\\n")
|
42 |
-
|
43 |
-
ater_text = text2.split(text1)
|
44 |
-
if len(ater_text) > 1:
|
45 |
-
ater_text = ater_text[1].lstrip() # Remove spaces
|
46 |
-
if ater_text.startswith('\n'):
|
47 |
-
return True
|
48 |
-
return False
|
49 |
-
|
50 |
-
# Example usage:
|
51 |
-
text1 = "hello"
|
52 |
-
text2 = "some text hello \nmore text"
|
53 |
-
result = is_newline_after_text_2(text1, text2)
|
54 |
-
print(f"Next char is newline: {result}\n")
|
55 |
-
|
56 |
-
text1 = "hello"
|
57 |
-
text2 = "some text hello more text"
|
58 |
-
result = is_newline_after_text_2(text1, text2)
|
59 |
-
print(f"Next char is newline: {result}\n")
|
60 |
-
|
61 |
-
text1 = "hello"
|
62 |
-
text2 = "some text hello \nmore text"
|
63 |
-
result = is_newline_after_text_2(text1, text2)
|
64 |
-
print(f"Next char is newline: {result}\n")
|
65 |
-
|
66 |
-
text1 = "hello"
|
67 |
-
text2 = "some text hello\t\nmore text" #test tab space before newline
|
68 |
-
result = is_newline_after_text_2(text1, text2)
|
69 |
-
print(f"Next char is newline: {result}\n")
|
70 |
-
|
71 |
-
text1 = "hello." #test special characters
|
72 |
-
text2 = "some text hello. \nmore text"
|
73 |
-
result = is_newline_after_text_2(text1, text2)
|
74 |
-
print(f"Next char is newline: {result}\n")
|
|
|
1 |
+
a = [1, 2]
|
2 |
+
a.append(None)
|
3 |
+
print(a)
|