pmkhanh7890 committed
Commit 0827f9d · 1 Parent(s): a5e8d12

Add comments to text module

application.py CHANGED
@@ -125,7 +125,7 @@ FOR GOVERNOR<br>
125
  - Each highlighted pair (marked with a number) shows the key differences
126
  between the input text and the source.
127
  """
128
- table = f"""
129
  <h5>Comparison between input news and source news:</h5>
130
  <table border="1" style="width:100%; text-align:left;">
131
  <col style="width: 170px;">
@@ -144,7 +144,7 @@ between the input text and the source.
144
  <tr>
145
  <td style="border-bottom: 1px solid transparent";>TBD</td>
146
  <td style="border-bottom: 1px solid transparent";>TBD</td>
147
- <td rowspan="2"> <img src="https://huggingface.co/spaces/pmkhanh7890/news_verification/resolve/main/examples/example_image_input.jpg" alt="A picture of a cat."></td>
148
  <td rowspan="2">TBD</td>
149
  </tr>
150
  <tr>
 
125
  - Each highlighted pair (marked with a number) shows the key differences
126
  between the input text and the source.
127
  """
128
+ table = """
129
  <h5>Comparison between input news and source news:</h5>
130
  <table border="1" style="width:100%; text-align:left;">
131
  <col style="width: 170px;">
 
144
  <tr>
145
  <td style="border-bottom: 1px solid transparent";>TBD</td>
146
  <td style="border-bottom: 1px solid transparent";>TBD</td>
147
+ <td rowspan="2">TBD</td>
148
  <td rowspan="2">TBD</td>
149
  </tr>
150
  <tr>
gpt_test.py CHANGED
@@ -96,7 +96,7 @@ azure_client = AzureOpenAI(
96
  api_version="2024-05-01-preview",
97
  )
98
 
99
- deplopment_name = "gpt-4o" # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
100
  TEXT_PROMPT = """
101
  Paraphrase the following news, only output the paraphrased text:
102
 
 
96
  api_version="2024-05-01-preview",
97
  )
98
 
99
+ deplopment_name = "gpt-4o" # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
100
  TEXT_PROMPT = """
101
  Paraphrase the following news, only output the paraphrased text:
102
 
src/application/config.py ADDED
@@ -0,0 +1,84 @@
1
+ # Download necessary NLTK data files
2
+ """
3
+ Author: Khanh Phan
4
+ Date: 2024-12-04
5
+ """
6
+ import os
7
+
8
+ import nltk
9
+ import openai
10
+ import torch
11
+ from dotenv import load_dotenv
12
+ from sentence_transformers import SentenceTransformer
13
+
14
+ # Load environment variables
15
+ load_dotenv()
16
+ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
17
+ SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
18
+ AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
19
+ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
20
+ AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
21
+
22
+ # GPT Model
23
+ GPT_ENTITY_MODEL = "o1-mini" # "gpt-4o-mini" or "o1-mini"
24
+ GPT_PARAPHRASE_MODELS = ["gpt-4o", "gpt-4o-mini"]
25
+ AZUREOPENAI_CLIENT = openai.AzureOpenAI(
26
+ api_version=AZURE_OPENAI_API_VERSION, # AZURE_OPENAI_API_VERSION,
27
+ api_key=AZURE_OPENAI_API_KEY,
28
+ azure_endpoint=AZURE_OPENAI_ENDPOINT,
29
+ )
30
+
31
+ # Download the resources
32
+ nltk.download("punkt", quiet=True) # Sentence tokenization
33
+ nltk.download("punkt_tab", quiet=True) # Tokenization with tab-separated data
34
+ nltk.download("stopwords", quiet=True) # A list of stop words
35
+ STOPWORDS_LANG = "english"
36
+
37
+ # Load PARAPHRASE_MODEL
38
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
39
+ PARAPHRASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
40
+ PARAPHRASE_MODEL.to(DEVICE)
41
+
42
+ # Model to detect AI-generated text
43
+ AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
44
+
45
+ # Thresholds
46
+ PARAPHRASE_THRESHOLD_HUMAN = 0.963
47
+ PARAPHRASE_THRESHOLD_MACHINE = 0.8
48
+ PARAPHRASE_THRESHOLD = 0.8
49
+
50
+ MIN_SAME_SENTENCE_LEN = 6
51
+ MIN_PHRASE_SENTENCE_LEN = 10
52
+ MIN_RATIO_PARAPHRASE_NUM = 0.5
53
+ MAX_CHAR_SIZE = 30000
54
+
55
+ # Number of top URLs per search
56
+ TOP_URLS_PER_SEARCH = 3
57
+
58
+ # Search parameters
59
+ GOOGLE_ENDPOINT_URL = "https://www.googleapis.com/customsearch/v1"
60
+ TOP_SEARCH_RESUTLS = 10
61
+ CHUNK_SIZE = 32 # words
62
+ NUM_CHUNKS = 3 # number of chunks to search
63
+ NUM_FREQUENT_WORDS = 32 # number of top words to return
64
+ NUM_KEYWORDS = 5 # number of keywords to return
65
+
66
+ # Labels
67
+ MODEL_HUMAN_LABEL = {AI_TEXT_DECTECTION_MODEL: "Human"}
68
+ HUMAN = "HUMAN"
69
+ MACHINE = "MACHINE"
70
+ UNKNOWN = "UNKNOWN"
71
+ PARAPHRASE = "PARAPHRASE"
72
+ NON_PARAPHRASE = "NON_PARAPHRASE"
73
+
74
+ # Entity color
75
+ """
76
+ factor > 1: Lightens the color.
77
+ factor = 1: Leaves the color unchanged.
78
+ factor < 1: Darkens the color.
79
+ factor = 0: Black.
80
+ """
81
+ ENTITY_LIGHTEN_COLOR = 2.2
82
+ ENTITY_DARKEN_COLOR = 0.7
83
+ ENTITY_SATURATION = 0.65 # Saturation: color's intensity (vividness).
84
+ ENTITY_BRIGHTNESS = 0.75 # color's brightness.
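The factor semantics documented in the block above can be checked with a few lines of standalone Python. The sketch below mirrors the HSV scaling that entity.py's set_color_brightness (added later in this commit) performs; the helper name and sample colours here are illustrative only.

    import colorsys

    def scale_brightness(hex_color: str, factor: float) -> str:
        # Illustrative re-implementation of the scaling described above:
        # factor > 1 lightens, factor < 1 darkens, factor = 0 gives black.
        hex_color = hex_color.lstrip("#")
        r, g, b = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
        h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
        v = max(0.0, min(1.0, v * factor))
        r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
        return f"#{r:02x}{g:02x}{b:02x}"

    print(scale_brightness("#ffffff", 0.7))  # "#b2b2b2" (ENTITY_DARKEN_COLOR)
    print(scale_brightness("#000000", 2.2))  # "#000000" (value channel is 0, so black stays black)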
src/application/content_detection.py CHANGED
@@ -16,7 +16,7 @@ from src.application.text.model_detection import (
16
  detect_text_by_ai_model,
17
  predict_generation_model,
18
  )
19
- from src.application.text.preprocessing import split_into_paragraphs, split_into_sentences
20
  from src.application.text.search_detection import (
21
  PARAPHRASE_THRESHOLD_MACHINE,
22
  find_sentence_source,
@@ -112,7 +112,7 @@ class NewsVerification:
112
  na=False,
113
  )
114
  ]
115
-
116
  if len(machine_label) > 0:
117
  label = " ".join(machine_label["label"].tolist())
118
  self.text_prediction_label[0] = label
@@ -147,7 +147,7 @@ class NewsVerification:
147
  print("CHECK TEXT:")
148
  print("\tFrom search engine:")
149
  # Classify by search engine
150
- #input_sentences = split_into_sentences(self.news_text)
151
  input_paragraphs = split_into_paragraphs(self.news_text)
152
 
153
  # Setup df for input_sentences
@@ -402,8 +402,6 @@ class NewsVerification:
402
  if span_row == 1:
403
  last_url_row = True
404
 
405
- # end_of_paragraph = is_newline_after_text(row[0]["input"], self.news_content)
406
-
407
  formatted_row = self.format_text_fact_checker_row(
408
  row,
409
  first_url_row,
@@ -873,11 +871,11 @@ class NewsVerification:
873
 
874
  start_end = list(range(start, end + 1, 1))
875
  start_end = list(set(start_end) - set(ignore_indices))
876
- #new_start, new_end = self.extract_sequences(start_end)
877
  new_start, new_end = self.extract_new_startend(
878
- start,
879
- end,
880
- ignore_indices
881
  )
882
  filtered_starts.extend(new_start)
883
  filtered_ends.extend(new_end)
@@ -885,7 +883,7 @@ class NewsVerification:
885
  return filtered_starts, filtered_ends
886
 
887
  def extract_new_startend(self, start, end, ignore_indices):
888
- # sort a set of ignore_indices
889
  indexes = list(set(ignore_indices))
890
  indexes.sort()
891
 
@@ -896,22 +894,22 @@ class NewsVerification:
896
  new_starts.append(start)
897
  new_ends.append(end)
898
  return new_starts, new_ends
899
-
900
  for index in indexes:
901
  if index < start:
902
  continue
903
  elif index >= end:
904
  continue
905
-
906
  new_starts.append(new_start)
907
  new_ends.append(index)
908
 
909
  new_start = index + 1
910
-
911
  new_starts.append(new_start)
912
  new_ends.append(end)
913
 
914
- return new_starts, new_ends
915
 
916
  def extract_sequences(self, numbers):
917
  if len(numbers) == 1:
 
16
  detect_text_by_ai_model,
17
  predict_generation_model,
18
  )
19
+ from src.application.text.preprocessing import split_into_paragraphs
20
  from src.application.text.search_detection import (
21
  PARAPHRASE_THRESHOLD_MACHINE,
22
  find_sentence_source,
 
112
  na=False,
113
  )
114
  ]
115
+
116
  if len(machine_label) > 0:
117
  label = " ".join(machine_label["label"].tolist())
118
  self.text_prediction_label[0] = label
 
147
  print("CHECK TEXT:")
148
  print("\tFrom search engine:")
149
  # Classify by search engine
150
+ # input_sentences = split_into_sentences(self.news_text)
151
  input_paragraphs = split_into_paragraphs(self.news_text)
152
 
153
  # Setup df for input_sentences
 
402
  if span_row == 1:
403
  last_url_row = True
404
 
 
 
405
  formatted_row = self.format_text_fact_checker_row(
406
  row,
407
  first_url_row,
 
871
 
872
  start_end = list(range(start, end + 1, 1))
873
  start_end = list(set(start_end) - set(ignore_indices))
874
+ # new_start, new_end = self.extract_sequences(start_end)
875
  new_start, new_end = self.extract_new_startend(
876
+ start,
877
+ end,
878
+ ignore_indices,
879
  )
880
  filtered_starts.extend(new_start)
881
  filtered_ends.extend(new_end)
 
883
  return filtered_starts, filtered_ends
884
 
885
  def extract_new_startend(self, start, end, ignore_indices):
886
+ # sort a set of ignore_indices
887
  indexes = list(set(ignore_indices))
888
  indexes.sort()
889
 
 
894
  new_starts.append(start)
895
  new_ends.append(end)
896
  return new_starts, new_ends
897
+
898
  for index in indexes:
899
  if index < start:
900
  continue
901
  elif index >= end:
902
  continue
903
+
904
  new_starts.append(new_start)
905
  new_ends.append(index)
906
 
907
  new_start = index + 1
908
+
909
  new_starts.append(new_start)
910
  new_ends.append(end)
911
 
912
+ return new_starts, new_ends
913
 
914
  def extract_sequences(self, numbers):
915
  if len(numbers) == 1:
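The extract_new_startend method above splits one [start, end] range around a set of ignored indices. A standalone sketch of the same loop (illustrative only, not part of the commit) makes the behaviour concrete:

    def split_range_around(start: int, end: int, ignore_indices: list[int]):
        # Mirrors extract_new_startend: cut [start, end] at every ignored index.
        new_starts, new_ends = [], []
        new_start = start
        for index in sorted(set(ignore_indices)):
            if index < start or index >= end:
                continue
            new_starts.append(new_start)
            new_ends.append(index)
            new_start = index + 1
        new_starts.append(new_start)
        new_ends.append(end)
        return new_starts, new_ends

    print(split_range_around(0, 10, [3, 7]))  # ([0, 4, 8], [3, 7, 10])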
src/application/text/entity.py CHANGED
@@ -1,34 +1,49 @@
 
 
 
 
 
1
  import colorsys
2
  import json
3
- import os
4
  import re
5
 
6
  import gradio as gr
7
  import openai
8
- from dotenv import load_dotenv
9
  from transformers import pipeline
10
 
11
- ner_pipeline = pipeline("ner")
12
-
13
- load_dotenv()
14
- AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
15
- AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
16
- AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
17
-
18
- client = openai.AzureOpenAI(
19
- api_version="2024-05-01-preview", # AZURE_OPENAI_API_VERSION,
20
- api_key=AZURE_OPENAI_API_KEY,
21
- azure_endpoint=AZURE_OPENAI_ENDPOINT,
22
  )
23
 
 
 
24
 
25
  def extract_entities_gpt(
26
  original_text,
27
  compared_text,
28
- text_generation_model="o1-mini",
29
- ):
30
- # "gpt-4o-mini" or "o1-mini"
31
- # Generate text using the selected models
  prompt = f"""
33
  Compare the ORIGINAL TEXT and the COMPARED TEXT.
34
  Find entity pairs with significantly different meanings after paraphrasing.
@@ -60,14 +75,15 @@ If there are no entities that satisfy above condition, output empty list "[]".
60
  {compared_text}
61
  """
62
 
63
- # Generate text using the text generation model
64
  # Generate text using the selected model
65
  try:
66
- response = client.chat.completions.create(
 
67
  model=text_generation_model,
68
  messages=[{"role": "user", "content": prompt}],
69
  )
70
 
 
71
  res = response.choices[0].message.content
72
 
73
  except openai.OpenAIError as e:
@@ -77,15 +93,27 @@ If there are no entities that satisfy above condition, output empty list "[]".
77
  return res
78
 
79
 
80
- def read_json(json_string) -> list[list[str]]:
 
 
 
 
 
 
 
 
 
 
81
  try:
 
82
  entities = json.loads(json_string)
 
83
  # Remove duplicate pairs of entities
84
  unique_entities = []
85
  for inner_list in entities:
 
86
  if inner_list not in unique_entities:
87
  unique_entities.append(inner_list)
88
-
89
  return unique_entities
90
 
91
  except json.JSONDecodeError as e:
@@ -93,66 +121,94 @@ def read_json(json_string) -> list[list[str]]:
93
  return []
94
 
95
 
96
- def lighten_color(hex_color, factor=1.8):
97
- """Lightens a HEX color by increasing its brightness in HSV space."""
98
-
99
- hex_color = hex_color.lstrip("#")
100
- r, g, b = (
101
- int(hex_color[0:2], 16),
102
- int(hex_color[2:4], 16),
103
- int(hex_color[4:6], 16),
104
- )
105
-
106
- # Convert to HSV
107
- h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
108
- v = min(1.0, v * factor) # Increase brightness
109
-
110
- # Convert back to HEX
111
- r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
112
- return f"#{r:02x}{g:02x}{b:02x}"
113
-
114
 
115
- def darken_color(hex_color, factor=0.7):
116
- """Darkens a hex color by reducing its brightness in the HSV space."""
 
117
 
 
 
 
 
118
  hex_color = hex_color.lstrip("#")
 
 
119
  r, g, b = (
120
- int(hex_color[0:2], 16),
121
- int(hex_color[2:4], 16),
122
- int(hex_color[4:6], 16),
123
  )
124
 
125
- # Convert to HSV to adjust brightness
126
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
127
- v = max(0, v * factor) # Reduce brightness
128
 
129
- # Convert back to HEX
 
 
 
130
  r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
 
 
131
  return f"#{r:02x}{g:02x}{b:02x}"
132
 
133
 
134
- def generate_color(index, total_colors=20):
135
- """Generates a unique, evenly spaced color for each index using HSL."""
 
136
 
 
 
 
 
 
 
 
 
 
 
137
  hue = index / total_colors # Spread hues in range [0,1]
138
- saturation = 0.65 # Keep colors vivid
139
- lightness = 0.75 # Balanced brightness
140
 
141
  # Convert HSL to RGB
142
- r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
 
 
143
  r, g, b = int(r * 255), int(g * 255), int(b * 255)
144
 
145
- return f"#{r:02x}{g:02x}{b:02x}" # Convert to hex
 
 
146
 
 
 
 
147
 
148
- def assign_colors_to_entities(entities):
 
 
 
 
 
 
 
 
 
 
 
 
149
  total_colors = len(entities)
150
- # Assign colors to entities
 
151
  entities_colors = []
152
  for index, entity in enumerate(entities):
153
- color = generate_color(index, total_colors)
154
 
155
- # append color and index to entities_colors
156
  entities_colors.append(
157
  {"color": color, "input": entity[0], "source": entity[1]},
158
  )
@@ -160,43 +216,83 @@ def assign_colors_to_entities(entities):
160
  return entities_colors
161
 
162
 
163
- def highlight_entities(text1, text2):
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  if text1 is None or text2 is None:
165
  return None
166
 
 
167
  entities_text = extract_entities_gpt(text1, text2)
168
 
169
- # Clean up entities: remove wrapping characters
170
  entities_text = entities_text.replace("```json", "").replace("```", "")
171
 
 
172
  entities = read_json(entities_text)
 
 
173
  if len(entities) == 0:
174
  return None
175
 
176
- # Assign colors to entities
177
  entities_with_colors = assign_colors_to_entities(entities)
178
 
179
  return entities_with_colors
180
 
181
 
182
- def apply_highlight(text, entities_with_colors, key="input", count=0):
 
183
  if entities_with_colors is None:
184
  return text, []
185
 
 
186
  all_starts = []
187
  all_ends = []
188
  highlighted_text = ""
189
  temp_text = text
 
 
190
  for index, entity in enumerate(entities_with_colors):
191
  highlighted_text = ""
192
 
193
- # find a list of starts and ends of entity in text:
194
- # starts = [m.start() for m in re.finditer(entity[key], temp_text)]
195
- # ends = [m.end() for m in re.finditer(entity[key], temp_text)]
196
  starts = []
197
  ends = []
198
- # "\b" is for bound a word
199
  for m in re.finditer(
 
200
  r"\b" + re.escape(entity[key]) + r"\b",
201
  temp_text,
202
  ):
@@ -206,78 +302,116 @@ def apply_highlight(text, entities_with_colors, key="input", count=0):
206
  all_starts.extend(starts)
207
  all_ends.extend(ends)
208
 
 
209
  color = entities_with_colors[index]["color"]
210
- entity_color = lighten_color(
 
 
211
  color,
212
- factor=2.2,
213
- ) # Lightened color for background text
214
- label_color = darken_color(
 
215
  entity_color,
216
- factor=0.7,
217
- ) # Darker color for background label (index)
218
 
219
- # Apply highlighting to each entity
220
  prev_end = 0
221
  for start, end in zip(starts, ends):
222
- # Append non-highlighted text
223
  highlighted_text += temp_text[prev_end:start]
224
 
225
- # Style the index as a label
226
  index_label = (
227
  f'<span_style="background-color:{label_color};color:white;'
228
  f"padding:1px_4px;border-radius:4px;font-size:12px;"
229
  f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>' # noqa: E501
230
  )
231
 
232
- # Append highlighted text with index label
233
  highlighted_text += (
234
  f'<span_style="background-color:{entity_color};color:black;'
235
  f'border-radius:3px;font-size:14px;display:inline-block;">'
236
  f"{index_label}{temp_text[start:end]}</span>"
237
  )
238
  prev_end = end
 
 
239
  highlighted_text += temp_text[prev_end:]
 
 
240
  temp_text = highlighted_text
241
 
242
  if highlighted_text == "":
243
  return text, []
 
 
244
  highlight_idx_list = get_index_list(highlighted_text)
245
  return highlighted_text, highlight_idx_list
246
 
247
 
248
- def get_index_list(highlighted_text):
249
  """
250
- Generates a list of indices between corresponding start and end indices.
251
 
252
  Args:
253
- starts: A list of starting indices.
254
- ends: A list of ending indices. Must be the same length as starts.
255
 
256
  Returns:
257
- A list containing all indices within the specified ranges.
258
- Returns an empty list if the input is invalid (e.g., different lengths,
259
- end < start, etc.).
260
  """
261
  highlighted_index = []
 
 
262
  words = highlighted_text.split()
263
  for index, word in enumerate(words):
 
264
  if word.startswith("<span_style"):
265
  start_index = index
 
 
266
  if word.endswith("</span>"):
267
  end_index = index
268
-
269
- highlighted_index.extend(list(range(start_index, end_index + 1)))
 
 
 
 
 
 
 
 
 
 
 
270
 
271
  return highlighted_index
272
 
273
 
274
- def extract_entities(text):
 
 
 
 
 
 
 
 
 
 
275
  output = ner_pipeline(text)
 
 
276
  words = extract_words(output)
 
 
277
  words = combine_subwords(words)
278
 
279
- # extract word in each entity and assign to a list of entities,
280
- # connect words if there is no space between them
281
  entities = []
282
  for entity in words:
283
  if entity not in entities:
@@ -286,15 +420,17 @@ def extract_entities(text):
286
  return entities
287
 
288
 
289
- def extract_words(entities):
290
  """
291
  Extracts the words from a list of entities.
292
 
293
  Args:
294
- entities: A list of entities.
 
 
295
 
296
  Returns:
297
- A list of words extracted from the entities.
298
  """
299
  words = []
300
  for entity in entities:
@@ -307,24 +443,26 @@ def combine_subwords(word_list):
307
  Combines subwords (indicated by "##") with the preceding word in a list.
308
 
309
  Args:
310
- word_list: A list of words, where subwords are prefixed with "##".
 
311
 
312
  Returns:
313
- A new list with subwords combined with their preceding words.
 
314
  """
315
  result = []
316
  i = 0
317
  while i < len(word_list):
318
  if word_list[i].startswith("##"):
319
- result[-1] += word_list[i][
320
- 2:
321
- ] # Remove "##" and append to the previous word
322
- elif (
323
- i < len(word_list) - 2 and word_list[i + 1] == "-"
324
- ): # Combine hyphenated words
325
  result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
326
- i += 2 # Skip the next two words
327
  else:
 
 
328
  result.append(word_list[i])
329
  i += 1
330
  return result
@@ -360,6 +498,7 @@ is losing territory in the east. Zelensky praised Japan's commitment
360
  on Thursday, amid wider concerns that the next US President, who is
361
  set to take office on Monday, could potentially reduce aid.
362
  """
 
363
  if __name__ == "__main__":
364
  with gr.Blocks() as demo:
365
  gr.Markdown("### Highlight Matching Parts Between Two Texts")
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
+
6
  import colorsys
7
  import json
 
8
  import re
9
 
10
  import gradio as gr
11
  import openai
 
12
  from transformers import pipeline
13
 
14
+ from src.application.config import (
15
+ AZUREOPENAI_CLIENT,
16
+ ENTITY_BRIGHTNESS,
17
+ ENTITY_DARKEN_COLOR,
18
+ ENTITY_LIGHTEN_COLOR,
19
+ ENTITY_SATURATION,
20
+ GPT_ENTITY_MODEL,
 
 
 
 
21
  )
22
 
23
+ ner_pipeline = pipeline("ner")
24
+
25
 
26
  def extract_entities_gpt(
27
  original_text,
28
  compared_text,
29
+ text_generation_model=GPT_ENTITY_MODEL,
30
+ ) -> str:
31
+ """
32
+ Extracts entity pairs with significantly different meanings between
33
+ two texts using a GPT model.
34
+
35
+ Args:
36
+ original_text (str): The original text.
37
+ compared_text (str): The paraphrased or compared text.
38
+ text_generation_model (str, optional): The GPT model
39
+ to use for entity extraction.
40
+
41
+ Returns:
42
+ str: The JSON-like string containing the extracted entity pairs,
43
+ or an empty string if an error occurs.
44
+ """
45
+
46
+ # Construct the prompt for the GPT model.
47
  prompt = f"""
48
  Compare the ORIGINAL TEXT and the COMPARED TEXT.
49
  Find entity pairs with significantly different meanings after paraphrasing.
 
75
  {compared_text}
76
  """
77
 
 
78
  # Generate text using the selected model
79
  try:
80
+ # Send the prompt to the GPT model and get the response.
81
+ response = AZUREOPENAI_CLIENT.chat.completions.create(
82
  model=text_generation_model,
83
  messages=[{"role": "user", "content": prompt}],
84
  )
85
 
86
+ # Extract the generated content from the response.
87
  res = response.choices[0].message.content
88
 
89
  except openai.OpenAIError as e:
 
93
  return res
94
 
95
 
96
+ def read_json(json_string: str) -> list[list[str, str]]:
97
+ """
98
+ Parses a JSON string and returns a list of unique entity pairs.
99
+
100
+ Args:
101
+ json_string (str): The JSON string to parse.
102
+
103
+ Returns:
104
+ List[List[str, str]]: A list of unique entity pairs,
105
+ or an empty list if parsing fails.
106
+ """
107
  try:
108
+ # Attempt to parse the JSON string into a Python object
109
  entities = json.loads(json_string)
110
+
111
  # Remove duplicates pair of entities
112
  unique_entities = []
113
  for inner_list in entities:
114
+ # Check if the current entity pair is already existed.
115
  if inner_list not in unique_entities:
116
  unique_entities.append(inner_list)
 
117
  return unique_entities
118
 
119
  except json.JSONDecodeError as e:
 
121
  return []
122
 
123
 
124
+ def set_color_brightness(
125
+ hex_color: str,
126
+ brightness_factor: float = ENTITY_LIGHTEN_COLOR,
127
+ ) -> str:
128
+ """
129
+ Adjusts a HEX color's brightness in HSV space (>1 lightens, <1 darkens).
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
+ Args:
132
+ hex_color (str): The HEX color code (e.g., "#RRGGBB").
133
+ brightness_factor (float, optional): The factor by which to scale brightness.
134
 
135
+ Returns:
136
+ str: The lightened HEX color code.
137
+ """
138
+ # Remove the '#' prefix if present.
139
  hex_color = hex_color.lstrip("#")
140
+
141
+ # Convert the HEX color to RGB (red, green, blue) integers.
142
  r, g, b = (
143
+ int(hex_color[0:2], 16), # Red component
144
+ int(hex_color[2:4], 16), # Green component
145
+ int(hex_color[4:6], 16), # Blue component
146
  )
147
 
148
+ # Convert RGB to HSV (hue, saturation, value/brightness)
149
  h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
 
150
 
151
+ # Scale the brightness by the specified factor, capping it at 1.0.
152
+ v = min(1.0, v * brightness_factor)
153
+
154
+ # Convert the modified HSV back to RGB
155
  r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
156
+
157
+ # Convert the RGB values back to a HEX color code.
158
  return f"#{r:02x}{g:02x}{b:02x}"
159
 
160
 
161
+ def generate_colors(index: int, total_colors: int = 20) -> str:
162
+ """
163
+ Generates a unique, evenly spaced color for each index using HSL.
164
 
165
+ Args:
166
+ index (int): The index for which to generate a color.
167
+ total_colors (int, optional): The total number of colors to
168
+ distribute evenly. Defaults to 20.
169
+
170
+ Returns:
171
+ str: A HEX color code representing the generated color.
172
+ """
173
+ # Calculate the hue value based on the index and total number of colors.
174
+ # This ensures even distribution of hues across the color spectrum.
175
  hue = index / total_colors # Spread hues in range [0,1]
 
 
176
 
177
  # Convert HSL to RGB
178
+ r, g, b = colorsys.hls_to_rgb(hue, ENTITY_SATURATION, ENTITY_BRIGHTNESS)
179
+
180
+ # Scale the RGB values from [0, 1] to [0, 255]
181
  r, g, b = int(r * 255), int(g * 255), int(b * 255)
182
 
183
+ # Convert to hex
184
+ return f"#{r:02x}{g:02x}{b:02x}"
185
+
186
 
187
+ def assign_colors_to_entities(entities: list) -> list[dict]:
188
+ """
189
+ Assigns unique colors to each entity pair in a list.
190
 
191
+ Args:
192
+ entities (list): A list of entity pairs,
193
+ where each pair is a list of two strings.
194
+ Example: [["entity1_original", "entity1_compared"]]
195
+
196
+ Returns:
197
+ list: A list of dictionaries,
198
+ where each dictionary contains
199
+ - "color": the color of entity pair.
200
+ - "input": the original entity string.
201
+ - "source": the compared entity string.
202
+ """
203
+ # Number of colors needed.
204
  total_colors = len(entities)
205
+
206
+ # Assign colors to entities using their index.
207
  entities_colors = []
208
  for index, entity in enumerate(entities):
209
+ color = generate_colors(index, total_colors)
210
 
211
+ # Append color and index to entities_colors
212
  entities_colors.append(
213
  {"color": color, "input": entity[0], "source": entity[1]},
214
  )
 
216
  return entities_colors
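A short usage sketch with a made-up entity list; each pair receives an evenly spaced hue from generate_colors:

    pairs = [["Prime Minister", "President"], ["Tokyo", "Osaka"]]
    for item in assign_colors_to_entities(pairs):
        print(item["color"], item["input"], "->", item["source"])
    # prints one hex colour per pair; the hue depends on the pair's index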
217
 
218
 
219
+ def highlight_entities(text1: str, text2: str) -> list[dict]:
220
+ """
221
+ Highlights entities with significant differences between
222
+ two texts by assigning them unique colors.
223
+
224
+ Args:
225
+ text1 (str): input text.
226
+ text2 (str): source text.
227
+
228
+ Returns:
229
+ list: A list of dictionaries, where each dictionary
230
+ contains the highlighted entity information (color, input, source)
231
+ or None if no significant entities are found or an error occurs.
232
+ """
233
  if text1 is None or text2 is None:
234
  return None
235
 
236
+ # Extract entities with significant differences using a GPT model.
237
  entities_text = extract_entities_gpt(text1, text2)
238
 
239
+ # Clean up the extracted entities string by removing wrapping characters.
240
  entities_text = entities_text.replace("```json", "").replace("```", "")
241
 
242
+ # Parse the cleaned entities string into a Python list of entity pairs.
243
  entities = read_json(entities_text)
244
+
245
+ # If no significant entities are found, return None.
246
  if len(entities) == 0:
247
  return None
248
 
249
+ # Assign unique colors to the extracted entities.
250
  entities_with_colors = assign_colors_to_entities(entities)
251
 
252
  return entities_with_colors
253
 
254
 
255
+ def apply_highlight(
256
+ text: str,
257
+ entities_with_colors: list[dict],
258
+ key: str = "input",
259
+ count: int = 0,
260
+ ) -> tuple[str, list[int]]:
261
+ """
262
+ Applies highlighting to specified entities within a text,
263
+ assigning them unique colors and index labels.
264
+
265
+ Args:
266
+ text (str): The text to highlight.
267
+ entities_with_colors (list): A list of dictionaries,
268
+ where each dictionary represents an entity and its color.
269
+ key (str, optional): The key in the entity dictionary that
270
+ contains the entity text to highlight.
271
+ count (int, optional): An offset to add to the index labels.
272
+
273
+ Returns:
274
+ tuple:
275
+ - A tuple containing the highlighted text (str).
276
+ - A list of index positions (list).
277
+ """
278
  if entities_with_colors is None:
279
  return text, []
280
 
281
+ # Start & end indices of highlighted entities.
282
  all_starts = []
283
  all_ends = []
284
  highlighted_text = ""
285
  temp_text = text
286
+
287
+ # Apply highlighting to each entity.
288
  for index, entity in enumerate(entities_with_colors):
289
  highlighted_text = ""
290
 
 
 
 
291
  starts = []
292
  ends = []
293
+
294
  for m in re.finditer(
295
+ # Word boundaries (\b) and escape special characters
296
  r"\b" + re.escape(entity[key]) + r"\b",
297
  temp_text,
298
  ):
 
302
  all_starts.extend(starts)
303
  all_ends.extend(ends)
304
 
305
+ # Get the colors for each occurrence of the entity.
306
  color = entities_with_colors[index]["color"]
307
+
308
+ # Lightened color for background text
309
+ entity_color = set_color_brightness(
310
  color,
311
+ brightness_factor=ENTITY_LIGHTEN_COLOR,
312
+ )
313
+ # Darker color for background label (index)
314
+ label_color = set_color_brightness(
315
  entity_color,
316
+ brightness_factor=ENTITY_DARKEN_COLOR,
317
+ )
318
 
319
+ # Apply highlighting to each occurrence of the entity.
320
  prev_end = 0
321
  for start, end in zip(starts, ends):
322
+ # Non-highlighted text before the entity.
323
  highlighted_text += temp_text[prev_end:start]
324
 
325
+ # Create the index label with the specified color and style.
326
  index_label = (
327
  f'<span_style="background-color:{label_color};color:white;'
328
  f"padding:1px_4px;border-radius:4px;font-size:12px;"
329
  f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>' # noqa: E501
330
  )
331
 
332
+ # Highlighted entity with the specified color and style.
333
  highlighted_text += (
334
  f'<span_style="background-color:{entity_color};color:black;'
335
  f'border-radius:3px;font-size:14px;display:inline-block;">'
336
  f"{index_label}{temp_text[start:end]}</span>"
337
  )
338
  prev_end = end
339
+
340
+ # Append any remaining text after the last entity.
341
  highlighted_text += temp_text[prev_end:]
342
+
343
+ # Update the temporary text with the highlighted text.
344
  temp_text = highlighted_text
345
 
346
  if highlighted_text == "":
347
  return text, []
348
+
349
+ # Get the index list of the highlighted text.
350
  highlight_idx_list = get_index_list(highlighted_text)
351
  return highlighted_text, highlight_idx_list
352
 
353
 
354
+ def get_index_list(highlighted_text: str) -> list[int]:
355
  """
356
+ Generates a list of indices of highlighted words within a text.
357
 
358
  Args:
359
+ highlighted_text (str): The text containing highlighted words
360
+ wrapped in HTML-like span tags.
361
 
362
  Returns:
363
+ list: A list of indices corresponding to the highlighted words.
364
+ An empty list if no highlighted words are found.
 
365
  """
366
  highlighted_index = []
367
+ start_index = None
368
+ end_index = None
369
  words = highlighted_text.split()
370
  for index, word in enumerate(words):
371
+ # Check if the word marks the start of a highlighted span.
372
  if word.startswith("<span_style"):
373
  start_index = index
374
+
375
+ # Check if the word ends with a closing span tag
376
  if word.endswith("</span>"):
377
  end_index = index
378
+ if start_index is not None:
379
+ # Add the range of indices to the result list.
380
+ highlighted_index.extend(
381
+ list(
382
+ range(
383
+ start_index,
384
+ end_index + 1,
385
+ ),
386
+ ),
387
+ )
388
+
389
+ start_index = None
390
+ end_index = None
391
 
392
  return highlighted_index
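Because the generated markup deliberately avoids spaces inside the span tags (span_style, 1px_4px), whitespace splitting is enough to locate highlighted tokens. A hand-written, simplified example:

    sample = 'The <span_style="x">1</span>Prime Minister</span> spoke today'
    print(get_index_list(sample))  # [1, 2] -> the word positions of "Prime" and "Minister"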
393
 
394
 
395
+ def extract_entities(text: str):
396
+ """
397
+ Extracts named entities from the given text.
398
+
399
+ Args:
400
+ text (str): The input text to extract entities from.
401
+
402
+ Returns:
403
+ list: A list of unique extracted entities (string).
404
+ """
405
+ # Apply the Named Entity Recognition (NER) pipeline to the input text.
406
  output = ner_pipeline(text)
407
+
408
+ # Extract words from the NER pipeline output.
409
  words = extract_words(output)
410
+
411
+ # Combine subwords into complete words.
412
  words = combine_subwords(words)
413
 
414
+ # Append each entity if it is not already in the list.
 
415
  entities = []
416
  for entity in words:
417
  if entity not in entities:
 
420
  return entities
421
 
422
 
423
+ def extract_words(entities: list[dict]) -> list[str]:
424
  """
425
  Extracts the words from a list of entities.
426
 
427
  Args:
428
+ entities (list): A list of entities,
429
+ where each entity is expected to be a dictionary
430
+ containing a "word" key.
431
 
432
  Returns:
433
+ list[str]: A list of words extracted from the entities.
434
  """
435
  words = []
436
  for entity in entities:
 
443
  Combines subwords (indicated by "##") with the preceding word in a list.
444
 
445
  Args:
446
+ word_list (list): A list of words,
447
+ where subwords are prefixed with "##".
448
 
449
  Returns:
450
+ list: A new list with subwords combined with their preceding words
451
+ and hyphenated words combined.
452
  """
453
  result = []
454
  i = 0
455
  while i < len(word_list):
456
  if word_list[i].startswith("##"):
457
+ # Remove "##" and append the remaining to the previous word
458
+ result[-1] += word_list[i][2:]
459
+ elif i < len(word_list) - 2 and word_list[i + 1] == "-":
460
+ # Combine the current word, the hyphen, and the next word.
 
 
461
  result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
462
+ i += 2 # Skip the next two words (hyphen and the following word)
463
  else:
464
+ # If neither a subword nor a hyphenated word,
465
+ # append the current word to the result list.
466
  result.append(word_list[i])
467
  i += 1
468
  return result
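Two worked examples of combine_subwords, using token shapes the NER pipeline typically emits:

    print(combine_subwords(["Frank", "##furt"]))            # ['Frankfurt']
    print(combine_subwords(["co", "-", "founder", "Ada"]))  # ['co-founder', 'Ada']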
 
498
  on Thursday, amid wider concerns that the next US President, who is
499
  set to take office on Monday, could potentially reduce aid.
500
  """
501
+
502
  if __name__ == "__main__":
503
  with gr.Blocks() as demo:
504
  gr.Markdown("### Highlight Matching Parts Between Two Texts")
src/application/text/helper.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import re
2
  import string
3
  from collections import Counter
@@ -8,9 +13,18 @@ from nltk.util import ngrams
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
 
10
 
11
- def clean_text(text):
12
- """Doc cleaning"""
13
- # exclude , and . due to number
 
 
 
 
 
 
 
 
 
14
  punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
15
 
16
  # Lowering text
@@ -22,23 +36,51 @@ def clean_text(text):
22
  # Removing whitespace and newlines
23
  text = re.sub(r"\s+", " ", text)
24
 
 
25
  text = text.replace("£", " * ")
26
 
 
27
  words = text.split()
28
- text = " ".join(words[:18]) # Join the first 18 words back into a string
 
 
29
 
30
  return text
31
 
32
 
33
- def remove_punctuation(text):
34
- """Remove punctuation from a given text."""
 
 
 
 
 
 
 
 
 
 
 
35
  punctuation_without_dot = string.punctuation.replace(".", "")
 
 
36
  translator = str.maketrans("", "", punctuation_without_dot)
 
 
37
  return text.translate(translator)
38
 
39
 
40
  def get_keywords(text, num_keywords=5):
41
- """Return top k keywords from a doc using TF-IDF method"""
 
 
 
 
 
 
 
 
 
42
 
43
  # Create a TF-IDF Vectorizer
44
  vectorizer = TfidfVectorizer(stop_words="english")
@@ -142,41 +184,76 @@ def extract_important_phrases(
142
  return important_phrases
143
 
144
 
145
- def extract_equal_text(text1, text2):
146
- def cleanup(text):
 
147
  text = text.lower()
148
  text = text.translate(str.maketrans("", "", string.punctuation))
149
  return text
150
 
 
151
  splited_text1 = cleanup(text1).split()
152
  splited_text2 = cleanup(text2).split()
153
 
 
154
  s = SequenceMatcher(None, splited_text1, splited_text2)
155
 
156
  equal_idx_1 = []
157
  equal_idx_2 = []
 
 
158
  text1 = text1.split()
159
  text2 = text2.split()
160
  for tag, i1, i2, j1, j2 in s.get_opcodes():
161
  if tag == "equal":
 
 
162
  equal_idx_1.append({"start": i1, "end": i2})
163
  equal_idx_2.append({"start": j1, "end": j2})
164
- subtext_1 = " ".join(text1[i1:i2])
165
- subtext_2 = " ".join(text2[j1:j2])
 
166
  # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] '
167
  # f'{subtext_1!r:>55} --> {subtext_2!r}')
168
  return equal_idx_1, equal_idx_2
169
 
170
 
171
- def connect_consecutive_indexes(nums):
172
  """
173
  Connects consecutive integers in a list.
174
 
175
  Args:
176
- nums: A list of integers.
177
 
178
  Returns:
179
- A list of lists, where each inner list represents a consecutive range.
 
 
180
  """
181
 
182
  if not nums: # Handle empty input
@@ -187,12 +264,15 @@ def connect_consecutive_indexes(nums):
187
  end = nums[0]
188
 
189
  for i in range(1, len(nums)):
 
190
  if nums[i] == end + 1:
191
- end = nums[i]
192
  else:
 
193
  result.append([start, end])
194
  start = nums[i]
195
  end = nums[i]
196
 
197
- result.append([start, end]) # Add the last range
 
198
  return result
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
+
6
  import re
7
  import string
8
  from collections import Counter
 
13
  from sklearn.feature_extraction.text import TfidfVectorizer
14
 
15
 
16
+ def clean_text(text: str) -> str:
17
+ """
18
+ Cleans and preprocesses a given text string.
19
+
20
+ Args:
21
+ text (str): The input text to be cleaned.
22
+
23
+ Returns:
24
+ str: The cleaned and preprocessed text, containing the first 18 words.
25
+ """
26
+ # Define a set of punctuation characters to exclude,
27
+ # keeping comma and period since they appear in numbers
28
  punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
29
 
30
  # Lowering text
 
36
  # Removing whitespace and newlines
37
  text = re.sub(r"\s+", " ", text)
38
 
39
+ # Replace £ with * because Google search doesn't recognize £
40
  text = text.replace("£", " * ")
41
 
42
+ # Split the text into a list of words.
43
  words = text.split()
44
+
45
+ # Join the first 18 words back into a string
46
+ text = " ".join(words[:18]) # TODO: consider another number
47
 
48
  return text
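Part of clean_text's body is elided in this diff, so only the behaviour visible above is illustrated: whatever else the hidden lines do, the final step keeps at most the first 18 words.

    long_text = " ".join(["alpha"] * 30)
    print(len(clean_text(long_text).split()))  # 18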
49
 
50
 
51
+ def remove_punctuation(text: str) -> str:
52
+ """
53
+ Removes all punctuation characters from a string, except for periods (.).
54
+
55
+ Args:
56
+ text (str): The input string.
57
+
58
+ Returns:
59
+ str: The string with all punctuation characters removed,
60
+ except for periods.
61
+ """
62
+ # Create a string containing all punctuation characters,
63
+ # except for periods.
64
  punctuation_without_dot = string.punctuation.replace(".", "")
65
+
66
+ # Create a translation table to remove the specified punctuation chars.
67
  translator = str.maketrans("", "", punctuation_without_dot)
68
+
69
+ # Apply the translation table to the input text and return the result.
70
  return text.translate(translator)
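A quick example: every punctuation mark except the period is stripped.

    print(remove_punctuation("U.K. raises rates, markets react!"))
    # 'U.K. raises rates markets react'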
71
 
72
 
73
  def get_keywords(text, num_keywords=5):
74
+ """
75
+ Extracts the top k keywords from a document using the TF-IDF method.
76
+
77
+ Args:
78
+ text (str): The input text from which to extract keywords.
79
+ num_keywords (int, optional): The number of top keywords to return.
80
+
81
+ Returns:
82
+ list: A list of the top keywords extracted from the text.
83
+ """
84
 
85
  # Create a TF-IDF Vectorizer
86
  vectorizer = TfidfVectorizer(stop_words="english")
 
184
  return important_phrases
185
 
186
 
187
+ def extract_equal_text(text1: str, text2: str) -> tuple[list[dict], list[dict]]:
188
+ """
189
+ Extracts the indices of equal text segments between two strings.
190
+
191
+ Args:
192
+ text1 (str): The first input string.
193
+ text2 (str): The second input string.
194
+
195
+ Returns:
196
+ tuple[
197
+ list[dict{"start": int, "end": int}],
198
+ list[dict{"start": int, "end": int}]
199
+ ]
200
+ - list: the start and end indices of equal segments in text1.
201
+ - list: the start and end indices of equal segments in text2.
202
+ """
203
+
204
+ def cleanup(text: str) -> str:
205
+ """
206
+ Cleans up a text string by converting to lowercase
207
+ and removing punctuation.
208
+
209
+ Args:
210
+ text (str): The input text.
211
+
212
+ Returns:
213
+ str: The cleaned text.
214
+ """
215
  text = text.lower()
216
  text = text.translate(str.maketrans("", "", string.punctuation))
217
  return text
218
 
219
+ # Clean and split the input texts into lists of words.
220
  splited_text1 = cleanup(text1).split()
221
  splited_text2 = cleanup(text2).split()
222
 
223
+ # Create a SequenceMatcher object to compare the cleaned word lists.
224
  s = SequenceMatcher(None, splited_text1, splited_text2)
225
 
226
  equal_idx_1 = []
227
  equal_idx_2 = []
228
+
229
+ # Split the original texts into lists of words (without cleaning).
230
  text1 = text1.split()
231
  text2 = text2.split()
232
  for tag, i1, i2, j1, j2 in s.get_opcodes():
233
  if tag == "equal":
234
+ # Append the start and end indices of the equal segment
235
+ # to the respective lists.
236
  equal_idx_1.append({"start": i1, "end": i2})
237
  equal_idx_2.append({"start": j1, "end": j2})
238
+
239
+ # subtext_1 = " ".join(text1[i1:i2])
240
+ # subtext_2 = " ".join(text2[j1:j2])
241
  # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] '
242
  # f'{subtext_1!r:>55} --> {subtext_2!r}')
243
  return equal_idx_1, equal_idx_2
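A usage sketch of extract_equal_text; the returned indices are word positions, so callers can map them back onto the original (uncleaned) token lists:

    a = "The cat sat on the mat."
    b = "A cat sat on a mat."
    idx_a, idx_b = extract_equal_text(a, b)
    print(idx_a)  # e.g. [{'start': 1, 'end': 4}, {'start': 5, 'end': 6}]
    print(idx_b)  # e.g. [{'start': 1, 'end': 4}, {'start': 5, 'end': 6}]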
244
 
245
 
246
+ def connect_consecutive_indexes(nums: list[int]) -> list[list[int, int]]:
247
  """
248
  Connects consecutive integers in a list.
249
 
250
  Args:
251
+ nums (list): A list of integers.
252
 
253
  Returns:
254
+ list: A list of lists,
255
+ where each inner list represents a consecutive range.
256
+ For example: [1, 2, 3, 5, 6] becomes [[1, 3], [5, 6]].
257
  """
258
 
259
  if not nums: # Handle empty input
 
264
  end = nums[0]
265
 
266
  for i in range(1, len(nums)):
267
+ # Check if the current number is consecutive to the previous end.
268
  if nums[i] == end + 1:
269
+ end = nums[i] # Extend the current range.
270
  else:
271
+ # Add the current range to the result and start a new range.
272
  result.append([start, end])
273
  start = nums[i]
274
  end = nums[i]
275
 
276
+ # Add the last range to the result.
277
+ result.append([start, end])
278
  return result
src/application/text/highlight_text.py DELETED
@@ -1,202 +0,0 @@
1
- import colorsys
2
-
3
- import gradio as gr
4
-
5
-
6
- def lighten_color(hex_color, factor=1.8):
7
- """Lightens a HEX color by increasing its brightness in HSV space."""
8
-
9
- hex_color = hex_color.lstrip("#")
10
- r, g, b = (
11
- int(hex_color[0:2], 16),
12
- int(hex_color[2:4], 16),
13
- int(hex_color[4:6], 16),
14
- )
15
-
16
- # Convert to HSV
17
- h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
18
- v = min(1.0, v * factor) # Increase brightness
19
-
20
- # Convert back to HEX
21
- r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
22
- return f"#{r:02x}{g:02x}{b:02x}"
23
-
24
-
25
- def darken_color(hex_color, factor=0.7):
26
- """Darkens a hex color by reducing its brightness in the HSV space."""
27
-
28
- hex_color = hex_color.lstrip("#")
29
- r, g, b = (
30
- int(hex_color[0:2], 16),
31
- int(hex_color[2:4], 16),
32
- int(hex_color[4:6], 16),
33
- )
34
-
35
- # Convert to HSV to adjust brightness
36
- h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
37
- v = max(0, v * factor) # Reduce brightness
38
-
39
- # Convert back to HEX
40
- r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
41
- return f"#{r:02x}{g:02x}{b:02x}"
42
-
43
-
44
- # Generate unique colors for pairs
45
- def generate_color(index, total_colors=20):
46
- """Generates a unique, evenly spaced color for each index using HSL."""
47
-
48
- hue = index / total_colors # Spread hues in range [0,1]
49
- saturation = 0.65 # Keep colors vivid
50
- lightness = 0.75 # Balanced brightness
51
-
52
- # Convert HSL to RGB
53
- r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
54
- r, g, b = int(r * 255), int(g * 255), int(b * 255)
55
-
56
- return f"#{r:02x}{g:02x}{b:02x}" # Convert to hex
57
-
58
-
59
- def highlight_pairs(text1, text2):
60
- """Highlight matching pairs between two texts"""
61
- # Predefined matching pairs
62
- match_pairs = [
63
- {
64
- "index": 1,
65
- "text1": "deep learning",
66
- "start1": 13,
67
- "end1": 26,
68
- "text2": "deep learning",
69
- "start2": 12,
70
- "end2": 25,
71
- },
72
- {
73
- "index": 2,
74
- "text1": "neural networks",
75
- "start1": 56,
76
- "end1": 71,
77
- "text2": "neural networks",
78
- "start2": 68,
79
- "end2": 83,
80
- },
81
- {
82
- "index": 3,
83
- "text1": "AI research",
84
- "start1": 86,
85
- "end1": 97,
86
- "text2": "AI research",
87
- "start2": 55,
88
- "end2": 66,
89
- },
90
- ]
91
-
92
- # Assign unique colors to each index
93
- pair_colors = {
94
- pair["index"]: generate_color(
95
- pair["index"],
96
- total_colors=len(match_pairs),
97
- )
98
- for pair in match_pairs
99
- }
100
-
101
- def apply_highlight(
102
- text,
103
- pairs,
104
- key_start,
105
- key_end,
106
- key_index,
107
- pair_colors,
108
- ):
109
- highlighted_text = ""
110
- prev_end = 0
111
-
112
- for pair in sorted(pairs, key=lambda x: x[key_start]):
113
- start, end, index = pair[key_start], pair[key_end], pair[key_index]
114
- color = pair_colors.get(
115
- index,
116
- "#ddd",
117
- ) # Default color if not found
118
- color = lighten_color(
119
- color,
120
- factor=2.2,
121
- ) # Lightened color for background text
122
- label_color = darken_color(
123
- color,
124
- factor=0.7,
125
- ) # Make label color darker
126
-
127
- # Style the index as a label
128
- index_label = (
129
- f'<span style="background-color:{label_color}; color:white; '
130
- f"padding:1px 4px; border-radius:4px; font-size:12px; "
131
- f'font-weight:bold; display:inline-block; margin-right:4px;">{index}</span>' # noqa: E501
132
- )
133
-
134
- # Append non-highlighted text
135
- highlighted_text += text[prev_end:start]
136
- # Append highlighted text with index label
137
- highlighted_text += (
138
- f'<span style="background-color:{color}; '
139
- f'border-radius:3px; font-size:14px; display:inline-block;">'
140
- f"{index_label} {text[start:end]}</span>"
141
- )
142
- prev_end = end
143
-
144
- # Append remaining text
145
- highlighted_text += text[prev_end:]
146
- return highlighted_text
147
-
148
- # Apply highlighting to both texts using the global MATCH_PAIRS
149
- highlighted_text1 = apply_highlight(
150
- text1,
151
- match_pairs,
152
- "start1",
153
- "end1",
154
- "index",
155
- pair_colors,
156
- )
157
- highlighted_text2 = apply_highlight(
158
- text2,
159
- match_pairs,
160
- "start2",
161
- "end2",
162
- "index",
163
- pair_colors,
164
- )
165
-
166
- return highlighted_text1, highlighted_text2
167
-
168
-
169
- if __name__ == "__main__":
170
- # Create Gradio Interface
171
- text1 = ""
172
-
173
- with gr.Blocks() as demo:
174
- gr.Markdown("### Highlight Matching Parts Between Two texts")
175
- text1_input = gr.Textbox(
176
- label="Text 1",
177
- lines=5,
178
- value="""
179
- The field of deep learning is advancing rapidly.
180
- Modern neural networks are improving AI research significantly.
181
- """,
182
- )
183
- text2_input = gr.Textbox(
184
- label="Text 2",
185
- lines=5,
186
- value="""
187
- Advances in deep learning have led to breakthroughs in AI research.
188
- Neural networks are at the core of these innovations",
189
- """,
190
- )
191
- output1 = gr.HTML()
192
- output2 = gr.HTML()
193
- submit_button = gr.Button("Highlight Matches")
194
-
195
- submit_button.click(
196
- fn=highlight_pairs,
197
- inputs=[text1_input, text2_input],
198
- outputs=[output1, output2],
199
- )
200
-
201
- # Launch the Gradio app
202
- demo.launch()
src/application/text/model_detection.py CHANGED
@@ -1,44 +1,24 @@
1
- import os
 
 
 
2
 
3
- import torch
4
- from dotenv import load_dotenv
5
- from openai import (
6
- AzureOpenAI,
7
- OpenAIError,
8
- )
9
- from sentence_transformers import (
10
- SentenceTransformer,
11
- util,
12
- )
13
  from transformers import pipeline
14
 
15
- load_dotenv()
16
- AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
17
- AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
18
- AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
19
-
20
- azure_client = AzureOpenAI(
21
- azure_endpoint="https://quoc-nguyen.openai.azure.com/",
22
- api_key=AZURE_OPENAI_API_KEY,
23
- api_version="2024-05-01-preview",
 
24
  )
25
 
26
- # TODO: move to a config file
27
- # AI_TEXT_DECTECTION_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
28
- AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
29
-
30
- MODEL_HUMAN_LABEL = {AI_TEXT_DECTECTION_MODEL: "Human"}
31
- HUMAN = "HUMAN"
32
- MACHINE = "MACHINE"
33
- UNKNOWN = "UNKNOWN"
34
- PARAPHRASE = "PARAPHRASE"
35
- NON_PARAPHRASE = "NON_PARAPHRASE"
36
-
37
- # load the embedding model
38
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
39
- PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
40
- PARAPHASE_MODEL.to(DEVICE)
41
-
42
 
43
  def detect_text_by_ai_model(
44
  input_text: str,
@@ -51,29 +31,43 @@ def detect_text_by_ai_model(
51
 
52
  Detects if text is human or machine generated.
53
 
 
 
 
 
 
54
  Returns:
55
  tuple: (label, confidence_score)
56
  where label is HUMAN or MACHINE.
57
  """
58
  try:
 
59
  pipe = pipeline(
60
  "text-classification",
61
  model=model,
62
  tokenizer=model,
63
- max_length=max_length,
64
  truncation=True,
65
  device_map="auto", # good for GPU usage
66
  )
 
 
67
  input_text = input_text.replace("<br>", " ")
 
 
68
  result = pipe(input_text)[0]
69
  confidence_score = result["score"]
 
 
70
  if result["label"] == MODEL_HUMAN_LABEL[model]:
71
  label = HUMAN
72
  else:
73
  label = MACHINE
74
  generated_model, _ = predict_generation_model(input_text)
75
  label += f"<br>({generated_model})"
 
76
  return label, confidence_score
 
77
  except Exception as e: # Add exception handling
78
  print(f"Error in Roberta model inference: {e}")
79
  return UNKNOWN, 0.5 # Return UNKNOWN and 0.5 confidence if error
@@ -82,20 +76,31 @@ def detect_text_by_ai_model(
82
  def predict_generation_model(text: str) -> tuple[str, float]:
83
  """
84
  Predicts if text is generated by gpt-4o or gpt-4o-mini models.
85
- Compare the input text against the paraphrased text by the models.
 
 
 
86
 
87
  Returns:
88
  tuple: (label, confidence_score)
89
- where label is gpt-4o or gpt-4o-mini.
 
90
  """
91
  best_similarity = 0
92
- best_model = "gpt-4o"
93
- models = ["gpt-4o", "gpt-4o-mini"]
94
- for model in models:
 
95
  paraphrased_text = paraphrase_by_AI(text, model)
 
 
96
  if paraphrased_text is None:
97
  continue
 
 
98
  similarity = measure_text_similarity(text, paraphrased_text)
 
 
99
  if similarity > best_similarity:
100
  best_similarity = similarity
101
  best_model = model
@@ -105,10 +110,14 @@ def predict_generation_model(text: str) -> tuple[str, float]:
105
 
106
  def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str:
107
  """
108
- Paraphrase text using a given model.
 
 
 
 
109
 
110
  Returns:
111
- str: Paraphrased text.
112
  """
113
 
114
  prompt = f"""
@@ -116,18 +125,19 @@ Paraphrase the following news, only output the paraphrased text:
116
  {input_text}
117
  """
118
  try:
119
- response = azure_client.chat.completions.create(
120
  model=model,
121
  messages=[
122
  {"role": "user", "content": prompt},
123
  ],
124
- # max_tokens=100,
125
- # temperature=0.7,
126
- # top_p=0.9,
127
- # n=1,
128
  )
129
  paraphrased_text = response.choices[0].message.content
130
  return paraphrased_text
 
131
  except OpenAIError as e: # Add exception handling
132
  print(f"Error in AI model inference: {e}")
133
  return None
@@ -135,18 +145,24 @@ Paraphrase the following news, only output the paraphrased text:
135
 
136
  def measure_text_similarity(text1: str, text2: str) -> float:
137
  """
138
- Measure the similarity between two texts.
 
 
 
 
 
139
 
140
  Returns:
141
- float: Similarity score.
142
  """
143
- embeddings1 = PARAPHASE_MODEL.encode(
 
144
  text1,
145
  convert_to_tensor=True,
146
  device=DEVICE,
147
  show_progress_bar=False,
148
  )
149
- embeddings2 = PARAPHASE_MODEL.encode(
150
  text2,
151
  convert_to_tensor=True,
152
  device=DEVICE,
@@ -155,5 +171,4 @@ def measure_text_similarity(text1: str, text2: str) -> float:
155
 
156
  # Compute cosine similarity matrix
157
  similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
158
- print(similarity[0][0])
159
  return similarity[0][0]
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
 
6
+ from openai import OpenAIError
7
+ from sentence_transformers import util
 
 
 
 
 
 
 
 
8
  from transformers import pipeline
9
 
10
+ from src.application.config import (
11
+ AI_TEXT_DECTECTION_MODEL,
12
+ AZUREOPENAI_CLIENT,
13
+ DEVICE,
14
+ GPT_PARAPHRASE_MODELS,
15
+ HUMAN,
16
+ MACHINE,
17
+ MODEL_HUMAN_LABEL,
18
+ PARAPHRASE_MODEL,
19
+ UNKNOWN,
20
  )
21
 
 
22
 
23
  def detect_text_by_ai_model(
24
  input_text: str,
 
31
 
32
  Detects if text is human or machine generated.
33
 
34
+ Args:
35
+ input_text (str): The text to be classified.
36
+ model (str, optional): The name of the AI text detection model.
37
+ max_length (int, optional): The maximum length of the input text.
38
+
39
  Returns:
40
  tuple: (label, confidence_score)
41
  where label is HUMAN or MACHINE.
42
  """
43
  try:
44
+ # Create a text classification pipeline using the specified model.
45
  pipe = pipeline(
46
  "text-classification",
47
  model=model,
48
  tokenizer=model,
49
+ max_length=max_length, # TODO: consider: removal
50
  truncation=True,
51
  device_map="auto", # good for GPU usage
52
  )
53
+
54
+ # Replace HTML line breaks with spaces to improve processing.
55
  input_text = input_text.replace("<br>", " ")
56
+
57
+ # Perform text classification using the pipeline.
58
  result = pipe(input_text)[0]
59
  confidence_score = result["score"]
60
+
61
+ # Determine the label based on the model's prediction.
62
  if result["label"] == MODEL_HUMAN_LABEL[model]:
63
  label = HUMAN
64
  else:
65
  label = MACHINE
66
  generated_model, _ = predict_generation_model(input_text)
67
  label += f"<br>({generated_model})"
68
+
69
  return label, confidence_score
70
+
71
  except Exception as e: # Add exception handling
72
  print(f"Error in Roberta model inference: {e}")
73
  return UNKNOWN, 0.5 # Return UNKNOWN and 0.5 confidence if error
 
76
  def predict_generation_model(text: str) -> tuple[str, float]:
77
  """
78
  Predicts if text is generated by gpt-4o or gpt-4o-mini models.
79
+ Compares the input text against the paraphrased text by the models.
80
+
81
+ Args:
82
+ text (str): The input text to be analyzed.
83
 
84
  Returns:
85
  tuple: (label, confidence_score)
86
+ where label is gpt-4o or gpt-4o-mini,
87
+ and confidence_score is the highest similarity.
88
  """
89
  best_similarity = 0
90
+ best_model = GPT_PARAPHRASE_MODELS[0]
91
+
92
+ for model in GPT_PARAPHRASE_MODELS:
93
+ # Generate paraphrased text using the current model.
94
  paraphrased_text = paraphrase_by_AI(text, model)
95
+
96
+ # Skip to the next model if paraphrasing fails (returns None).
97
  if paraphrased_text is None:
98
  continue
99
+
100
+ # Similarity between the original text and the paraphrased text.
101
  similarity = measure_text_similarity(text, paraphrased_text)
102
+
103
+ # Update the best similarity
104
  if similarity > best_similarity:
105
  best_similarity = similarity
106
  best_model = model
 
110
 
111
  def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str:
112
  """
113
+ Paraphrases text using a given AI model.
114
+
115
+ Args:
116
+ input_text (str): The text to be paraphrased.
117
+ model (str, optional): The AI model to use for paraphrasing.
118
 
119
  Returns:
120
+ str: The paraphrased text, or None if an error occurs.
121
  """
122
 
123
  prompt = f"""
 
125
  {input_text}
126
  """
127
  try:
128
+ response = AZUREOPENAI_CLIENT.chat.completions.create(
129
  model=model,
130
  messages=[
131
  {"role": "user", "content": prompt},
132
  ],
133
+ # max_tokens=100, # Limit the number of tokens in the response.
134
+ # temperature=0.7, # Control the randomness of the response.
135
+ # top_p=0.9, # Control the nucleus sampling.
136
+ # n=1, # Generate multiple responses.
137
  )
138
  paraphrased_text = response.choices[0].message.content
139
  return paraphrased_text
140
+
141
  except OpenAIError as e: # Add exception handling
142
  print(f"Error in AI model inference: {e}")
143
  return None
 
145
 
146
  def measure_text_similarity(text1: str, text2: str) -> float:
147
  """
148
+ Measures the similarity between two texts
149
+ using cosine similarity of their sentence embeddings.
150
+
151
+ Args:
152
+ text1 (str): The first text string.
153
+ text2 (str): The second text string.
154
 
155
  Returns:
156
+ float: The cosine similarity score between the two texts.
157
  """
158
+ # Generate sentence embeddings
159
+ embeddings1 = PARAPHRASE_MODEL.encode(
160
  text1,
161
  convert_to_tensor=True,
162
  device=DEVICE,
163
  show_progress_bar=False,
164
  )
165
+ embeddings2 = PARAPHRASE_MODEL.encode(
166
  text2,
167
  convert_to_tensor=True,
168
  device=DEVICE,
 
171
 
172
  # Compute cosine similarity matrix
173
  similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
 
174
  return similarity[0][0]
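As a rough illustration of the comparison that measure_text_similarity performs, here is a sketch assuming sentence-transformers is installed and using the same paraphrase-MiniLM-L6-v2 checkpoint that the config loads:

    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
    emb1 = model.encode("The cat sat on the mat.", convert_to_tensor=True)
    emb2 = model.encode("A cat was sitting on the mat.", convert_to_tensor=True)
    # Close paraphrases typically score well above 0.8.
    print(util.cos_sim(emb1, emb2).item())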
src/application/text/preprocessing.py CHANGED
@@ -1,46 +1,67 @@
 
 
 
 
 
1
  from nltk.tokenize import sent_tokenize
2
 
3
 
4
- def split_into_sentences(input_text):
 
5
  """
6
- Splits input text into sentences by newlines.
 
7
 
8
  Args:
9
- input_text: The input text as a string.
10
 
11
  Returns:
12
- A list of sentences. Returns an empty list if input is not valid.
 
13
  """
14
  if not isinstance(input_text, str):
15
  return []
16
 
 
 
17
  paragraphs = input_text.splitlines(keepends=True)
18
  sentences = []
19
  for paragraph in paragraphs:
 
20
  paragraph = paragraph.strip()
 
21
  if paragraph and paragraph != "\n":
 
22
  sentences.extend(sent_tokenize(paragraph))
 
23
  return sentences
24
 
25
 
26
- def split_into_paragraphs(input_text):
27
  """
28
- Splits input text into sentences by newlines.
29
 
30
  Args:
31
- input_text: The input text as a string.
32
 
33
  Returns:
34
- A list of sentences. Returns an empty list if input is not valid.
 
35
  """
36
  if not isinstance(input_text, str):
37
  return []
38
 
 
 
39
  paragraphs = input_text.splitlines(keepends=True)
40
  out_paragraphs = []
 
41
  for paragraph in paragraphs:
 
42
  paragraph = paragraph.strip()
 
43
  if paragraph and paragraph != "\n":
 
44
  out_paragraphs.append(paragraph)
45
- print(f"paragraphs: {out_paragraphs}")
46
- return out_paragraphs
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
+
6
  from nltk.tokenize import sent_tokenize
7
 
8
 
9
+ # TODO: consider moving to helpers
10
+ def split_into_sentences(input_text: str) -> list[str]:
11
  """
12
+ Splits input text into sentences by newlines
13
+ and then tokenizes each paragraph into sentences.
14
 
15
  Args:
16
+ input_text (str): The input text as a string.
17
 
18
  Returns:
19
+ list: A list of sentences.
20
+ Returns an empty list if input is not a string.
21
  """
22
  if not isinstance(input_text, str):
23
  return []
24
 
25
+ # Split the input text into paragraphs based on newline characters,
26
+ # keeping the newline characters.
27
  paragraphs = input_text.splitlines(keepends=True)
28
  sentences = []
29
  for paragraph in paragraphs:
30
+ # Remove leading/trailing whitespace
31
  paragraph = paragraph.strip()
32
+
33
  if paragraph and paragraph != "\n":
34
+ # Tokenize the paragraph into sentences
35
  sentences.extend(sent_tokenize(paragraph))
36
+
37
  return sentences
38
 
39
 
40
+ def split_into_paragraphs(input_text: str) -> list[str]:
41
  """
42
+ Splits input text into paragraphs based on newline characters.
43
 
44
  Args:
45
+ input_text (str): The input text as a string.
46
 
47
  Returns:
48
+ list: A list of paragraphs.
49
+ Returns an empty list if input is not a string.
50
  """
51
  if not isinstance(input_text, str):
52
  return []
53
 
54
+ # Split the input text into paragraphs based on newline characters,
55
+ # keeping the newline characters.
56
  paragraphs = input_text.splitlines(keepends=True)
57
  out_paragraphs = []
58
+
59
  for paragraph in paragraphs:
60
+ # Remove leading/trailing whitespace
61
  paragraph = paragraph.strip()
62
+
63
  if paragraph and paragraph != "\n":
64
+ # Append the cleaned paragraph to the output list.
65
  out_paragraphs.append(paragraph)
66
+
67
+ return out_paragraphs
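A quick usage sketch for the two helpers above, assuming they are importable and the NLTK punkt data downloaded in config is available:

    from src.application.text.preprocessing import (
        split_into_paragraphs,
        split_into_sentences,
    )

    text = "AI detection is improving. It is not perfect yet.\nA second paragraph follows."
    print(split_into_sentences(text))
    # ['AI detection is improving.', 'It is not perfect yet.', 'A second paragraph follows.']
    print(split_into_paragraphs(text))
    # ['AI detection is improving. It is not perfect yet.', 'A second paragraph follows.']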
src/application/text/search.py CHANGED
@@ -1,38 +1,50 @@
1
- import os
 
 
 
 
2
  import string
3
  from collections import Counter
4
 
5
  import requests
6
- from dotenv import load_dotenv
7
  from nltk.corpus import stopwords
8
  from nltk.tokenize import word_tokenize
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
 
 
 
 
 
 
 
 
 
 
 
 
11
  from src.application.text.entity import extract_entities
12
 
13
- load_dotenv()
14
- GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
15
- SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
16
-
17
 
18
  def search_by_google(
19
  query,
20
- num_results=10,
21
  is_exact_terms=False,
22
  ) -> dict:
23
  """
24
- Searches the Google Custom Search Engine for the given query.
25
 
26
  Args:
27
- query: The search query.
28
- is_exact_terms: Whether to use exact terms search (True) or not.
29
- num_results: The number of results to return (default: 10).
 
 
30
 
31
  Returns:
32
- A dict containing the search results or None if there was an error.
 
33
  """
34
 
35
- url = "https://www.googleapis.com/customsearch/v1"
36
  params = {
37
  "key": GOOGLE_API_KEY,
38
  "cx": SEARCH_ENGINE_ID,
@@ -43,7 +55,7 @@ def search_by_google(
43
  else:
44
  params["q"] = query.replace('"', "")
45
 
46
- response = requests.get(url, params=params)
47
  if response.status_code == 200:
48
  return response.json()
49
  else:
@@ -51,26 +63,35 @@ def search_by_google(
51
  return None
52
 
53
 
54
- def get_most_frequent_words(input_text, number_word=32):
 
 
 
55
  """
56
- Gets the top words from the input text,
57
- excluding stop words and punctuation.
58
 
59
  Args:
60
- input_text: The input text as a string.
61
- number_word: The number of top words to return.
62
 
63
  Returns:
64
- A list of tuples, where each tuple contains a word and its frequency.
65
- Returns an empty list if input is not a string or is empty.
66
  """
 
67
  if not isinstance(input_text, str) or not input_text:
68
- return []
 
 
 
69
 
70
- words = word_tokenize(input_text.lower()) # Tokenize and lowercase
 
71
 
72
- stop_words = set(stopwords.words("english"))
73
- punctuation = set(string.punctuation) # get all punctuation
 
 
74
  filtered_words = [
75
  word
76
  for word in words
@@ -78,32 +99,40 @@ def get_most_frequent_words(input_text, number_word=32):
78
  and word not in stop_words
79
  and word not in punctuation
80
  ]
 
 
81
  word_frequencies = Counter(filtered_words)
 
 
82
  top_words = word_frequencies.most_common(number_word)
83
 
84
  for top_word in top_words:
85
  words.append(top_word[0])
86
 
87
- if len(words) > 32:
88
- search_phrase = " ".join(words[:32])
 
89
  else:
90
  search_phrase = " ".join(words[:number_word])
91
 
92
  return search_phrase
93
 
94
 
95
- def get_chunk(input_text, chunk_length=32, num_chunk=3):
 
 
 
 
96
  """
97
- Splits the input text into chunks of a specified length.
98
 
99
  Args:
100
- input_text: The input text as a string.
101
- num_chunk: The maximum number of chunks to create.
102
- chunk_length: The desired length of each chunk (in words).
103
 
104
  Returns:
105
- A list of string chunks.
106
- Returns an empty list if input is invalid.
107
  """
108
  if not isinstance(input_text, str):
109
  return []
@@ -112,8 +141,11 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
112
  input_words = input_text.split() # Split by any whitespace
113
 
114
  for i in range(num_chunk):
115
- start_index = i * chunk_length
116
- end_index = (i + 1) * chunk_length
 
 
 
117
  chunk = " ".join(input_words[start_index:end_index])
118
  if chunk: # Only append non-empty chunks
119
  chunks.append(chunk)
@@ -121,11 +153,20 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
121
  return chunks
122
 
123
 
124
- def get_keywords(text, num_keywords=5):
125
- """Return top k keywords from a doc using TF-IDF method"""
 
 
 
 
 
126
 
 
 
 
 
127
  # Create a TF-IDF Vectorizer
128
- vectorizer = TfidfVectorizer(stop_words="english")
129
 
130
  # Fit and transform the text
131
  tfidf_matrix = vectorizer.fit_transform([text])
@@ -144,7 +185,7 @@ def get_keywords(text, num_keywords=5):
144
  return [word for word, score in word_scores[:num_keywords]]
145
 
146
 
147
- def generate_search_phrases(input_text):
148
  """
149
  Generates different types of phrases for search purposes.
150
 
@@ -156,6 +197,7 @@ def generate_search_phrases(input_text):
156
  - A list of most frequent words.
157
  - The original input text.
158
  - A list of text chunks.
 
159
  """
160
  if not isinstance(input_text, str):
161
  return []
@@ -171,7 +213,7 @@ def generate_search_phrases(input_text):
171
  # Method 3: Split text by chunks
172
  search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
173
 
174
- # Method 4: Get most identities and key words
175
  entities = extract_entities(input_text)
176
  text_without_entities = remove_identities_from_text(input_text, entities)
177
  search_phrases.append(text_without_entities)
@@ -182,7 +224,7 @@ def generate_search_phrases(input_text):
182
  return search_phrases
183
 
184
 
185
- def remove_identities_from_text(input_text, entities):
186
  """
187
  Removes entities from the input text.
188
 
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
+
6
  import string
7
  from collections import Counter
8
 
9
  import requests
 
10
  from nltk.corpus import stopwords
11
  from nltk.tokenize import word_tokenize
12
  from sklearn.feature_extraction.text import TfidfVectorizer
13
 
14
+ from src.application.config import (
15
+ CHUNK_SIZE,
16
+ GOOGLE_API_KEY,
17
+ GOOGLE_ENDPOINT_URL,
18
+ NUM_CHUNKS,
19
+ NUM_FREQUENT_WORDS,
20
+ NUM_KEYWORDS,
21
+ SEARCH_ENGINE_ID,
22
+ STOPWORDS_LANG,
23
+ TOP_SEARCH_RESUTLS,
24
+ )
25
  from src.application.text.entity import extract_entities
26
 
 
 
 
 
27
 
28
  def search_by_google(
29
  query,
30
+ num_results=TOP_SEARCH_RESUTLS,
31
  is_exact_terms=False,
32
  ) -> dict:
33
  """
34
+ Performs a Google Custom Search API query.
35
 
36
  Args:
37
+ query (str): The search query string.
38
+ num_results (int, optional): The number of search results to return.
39
+ Defaults to TOP_SEARCH_RESUTLS.
40
+ is_exact_terms (bool, optional): whether to use an exact phrase search.
41
+ Defaults to False.
42
 
43
  Returns:
44
+ dict: JSON response from the Google Custom Search API,
45
+ or None if an error occurs.
46
  """
47
 
 
48
  params = {
49
  "key": GOOGLE_API_KEY,
50
  "cx": SEARCH_ENGINE_ID,
 
55
  else:
56
  params["q"] = query.replace('"', "")
57
 
58
+ response = requests.get(GOOGLE_ENDPOINT_URL, params=params)
59
  if response.status_code == 200:
60
  return response.json()
61
  else:
 
63
  return None
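For context, the Custom Search call above boils down to a single GET request. A minimal sketch with placeholder credentials (the real key and engine ID come from the environment via config):

    import requests

    params = {
        "key": "<GOOGLE_API_KEY>",    # placeholder
        "cx": "<SEARCH_ENGINE_ID>",   # placeholder
        "q": "example search phrase",
        "num": 10,                    # the API returns at most 10 results per request
    }
    response = requests.get("https://www.googleapis.com/customsearch/v1", params=params)
    if response.status_code == 200:
        urls = [item["link"] for item in response.json().get("items", [])]
        print(urls)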
64
 
65
 
66
+ def get_most_frequent_words(
67
+ input_text: str,
68
+ number_word: int = NUM_FREQUENT_WORDS,
69
+ ) -> str:
70
  """
71
+ Extracts the most frequent words from the input text
72
+ and forms a search phrase.
73
 
74
  Args:
75
+ input_text (str): The text from which to extract frequent words.
76
+ number_word (int, optional): The number of frequent words to extract.
77
 
78
  Returns:
79
+ str: A search phrase of the most frequent words,
+ or None if the input is invalid.
 
80
  """
81
+ # Check if the input text is valid
82
  if not isinstance(input_text, str) or not input_text:
83
+ return None
84
+
85
+ # Tokenize the input text into words and convert to lowercase
86
+ words = word_tokenize(input_text.lower())
87
 
88
+ # Get the set of stop words for the specified language
89
+ stop_words = set(stopwords.words(STOPWORDS_LANG))
90
 
91
+ # Get the set of punctuation characters
92
+ punctuation = set(string.punctuation)
93
+
94
+ # Filter out stop words, punctuation, and non-alphanumeric words
95
  filtered_words = [
96
  word
97
  for word in words
 
99
  and word not in stop_words
100
  and word not in punctuation
101
  ]
102
+
103
+ # Count the frequency of each filtered word
104
  word_frequencies = Counter(filtered_words)
105
+
106
+ # Get the most common words and their frequencies
107
  top_words = word_frequencies.most_common(number_word)
108
 
109
  for top_word in top_words:
110
  words.append(top_word[0])
111
 
112
+ # Construct the search phrase
113
+ if len(words) > NUM_FREQUENT_WORDS:
114
+ search_phrase = " ".join(words[:NUM_FREQUENT_WORDS])
115
  else:
116
  search_phrase = " ".join(words[:number_word])
117
 
118
  return search_phrase
119
 
120
 
121
+ def get_chunk(
122
+ input_text: str,
123
+ chunk_size: int = CHUNK_SIZE,
124
+ num_chunk: int = NUM_CHUNKS,
125
+ ) -> list[str]:
126
  """
127
+ Splits the input text into chunks of a specified size.
128
 
129
  Args:
130
+ input_text (str): The text to be chunked.
131
+ chunk_size (int, optional): The number of words per chunk.
132
+ num_chunk (int, optional): The number of chunks to generate.
133
 
134
  Returns:
135
+ list: A list of chunks of the input text.
 
136
  """
137
  if not isinstance(input_text, str):
138
  return []
 
141
  input_words = input_text.split() # Split by any whitespace
142
 
143
  for i in range(num_chunk):
144
+ # Calculate the start and end indices for the current chunk
145
+ start_index = i * chunk_size
146
+ end_index = (i + 1) * chunk_size
147
+
148
+ # Extract the words for the current chunk and join them into a string
149
  chunk = " ".join(input_words[start_index:end_index])
150
  if chunk: # Only append non-empty chunks
151
  chunks.append(chunk)
 
153
  return chunks
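The chunking above is plain word-window slicing. An equivalent standalone sketch, with illustrative sizes rather than the configured CHUNK_SIZE and NUM_CHUNKS values:

    words = "one two three four five six seven eight nine ten".split()
    chunk_size, num_chunk = 4, 3
    chunks = [
        " ".join(words[i * chunk_size:(i + 1) * chunk_size])
        for i in range(num_chunk)
    ]
    print([c for c in chunks if c])
    # ['one two three four', 'five six seven eight', 'nine ten']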
154
 
155
 
156
+ def get_keywords(text: str, num_keywords: int = NUM_KEYWORDS) -> list[str]:
157
+ """
158
+ Extracts the top keywords from a given text using the TF-IDF method.
159
+
160
+ Args:
161
+ text (str): The input text from which to extract keywords.
162
+ num_keywords (int, optional): The number of top keywords to return.
163
 
164
+ Returns:
165
+ list: A list of strings representing the top keywords extracted
166
+ from the text.
167
+ """
168
  # Create a TF-IDF Vectorizer
169
+ vectorizer = TfidfVectorizer(stop_words=STOPWORDS_LANG)
170
 
171
  # Fit and transform the text
172
  tfidf_matrix = vectorizer.fit_transform([text])
 
185
  return [word for word, score in word_scores[:num_keywords]]
186
 
187
 
188
+ def generate_search_phrases(input_text: str) -> list[str]:
189
  """
190
  Generates different types of phrases for search purposes.
191
 
 
197
  - A list of most frequent words.
198
  - The original input text.
199
  - A list of text chunks.
200
+ - The input text with named entities removed.
201
  """
202
  if not isinstance(input_text, str):
203
  return []
 
213
  # Method 3: Split text by chunks
214
  search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
215
 
216
+ # Method 4: Remove identities and key words
217
  entities = extract_entities(input_text)
218
  text_without_entities = remove_identities_from_text(input_text, entities)
219
  search_phrases.append(text_without_entities)
 
224
  return search_phrases
225
 
226
 
227
+ def remove_identities_from_text(input_text: str, entities: list[str]) -> str:
228
  """
229
  Removes entities from the input text.
230
 
src/application/text/search_detection.py CHANGED
@@ -1,14 +1,22 @@
 
 
 
 
 
1
  import warnings
2
- from difflib import SequenceMatcher
3
 
4
- import nltk
5
  import numpy as np
6
- import torch
7
- from sentence_transformers import (
8
- SentenceTransformer,
9
- util,
 
 
 
 
 
 
10
  )
11
-
12
  from src.application.text.preprocessing import split_into_sentences
13
  from src.application.text.search import (
14
  generate_search_phrases,
@@ -18,39 +26,43 @@ from src.application.url_reader import URLReader
18
 
19
  warnings.simplefilter(action="ignore", category=FutureWarning)
20
 
21
- # Download necessary NLTK data files
22
- nltk.download("punkt", quiet=True)
23
- nltk.download("punkt_tab", quiet=True)
24
- nltk.download("stopwords", quiet=True)
25
-
26
- # load the model
27
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
28
- PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
29
- PARAPHASE_MODEL.to(DEVICE)
30
-
31
- PARAPHRASE_THRESHOLD_HUMAN = 0.963
32
- PARAPHRASE_THRESHOLD_MACHINE = 0.8
33
- PARAPHRASE_THRESHOLD = 0.8
34
-
35
- MIN_SAME_SENTENCE_LEN = 6
36
- MIN_PHRASE_SENTENCE_LEN = 10
37
- MIN_RATIO_PARAPHRASE_NUM = 0.5
38
- MAX_CHAR_SIZE = 30000
39
 
 
 
 
 
 
 
 
 
40
 
41
- def find_sentence_source(text, text_index, sentences_df):
 
 
 
42
 
43
- checked_urls = set()
 
 
 
 
 
 
 
44
  searched_phrases = generate_search_phrases(text[text_index])
45
 
46
  for candidate in searched_phrases:
 
47
  search_results = search_by_google(candidate)
 
 
48
  urls = [item["link"] for item in search_results.get("items", [])]
49
 
50
- for url in urls[:3]:
51
- if url in checked_urls: # visited url
 
52
  continue
53
- if "bbc.com" not in url:
54
  continue
55
 
56
  checked_urls.add(url)
@@ -96,13 +108,13 @@ def find_sentence_source(text, text_index, sentences_df):
96
  if c in sentences_df.columns:
97
  sentences_df.loc[text_index, c] = aligned_sentence[c]
98
 
 
99
  for idx, _ in sentences_df.iterrows():
100
  similarity = sentences_df.loc[idx, "similarity"]
101
  if similarity is not None:
102
  if similarity > PARAPHRASE_THRESHOLD_MACHINE:
103
  continue
104
 
105
- # find matched content in new url
106
  aligned_sentence = check_paraphrase(
107
  text[idx],
108
  source_text,
@@ -125,141 +137,56 @@ def find_sentence_source(text, text_index, sentences_df):
125
  sentences_df.loc[idx, c] = aligned_sentence[c]
126
  return sentences_df, content.images
127
 
 
128
  sentences_df.loc[text_index, "input"] = text[text_index]
129
  return sentences_df, []
130
 
131
 
132
- def longest_common_subsequence(arr1, arr2):
133
- """
134
- Finds the length of the longest common subsequence (contiguous) between
135
- two arrays.
136
-
137
- Args:
138
- arr1: The first array.
139
- arr2: The second array.
140
-
141
- Returns:
142
- The length of the longest common subsequence.
143
- Returns 0 if either input is invalid.
144
- """
145
-
146
- if not isinstance(arr1, list) or not isinstance(arr2, list):
147
- return 0
148
-
149
- n = len(arr1)
150
- m = len(arr2)
151
-
152
- if n == 0 or m == 0: # handle empty list
153
- return 0
154
-
155
- # Create table dp with size (n+1) x (m+1)
156
- dp = [[0] * (m + 1) for _ in range(n + 1)]
157
- max_length = 0
158
-
159
- for i in range(1, n + 1):
160
- for j in range(1, m + 1):
161
- if arr1[i - 1] == arr2[j - 1]:
162
- dp[i][j] = dp[i - 1][j - 1] + 1
163
- max_length = max(max_length, dp[i][j])
164
- else:
165
- dp[i][j] = 0 # set 0 since the array must be consecutive
166
-
167
- return max_length
168
-
169
-
170
- def check_sentence(
171
- input_sentence,
172
- source_sentence,
173
- min_same_sentence_len,
174
- min_phrase_sentence_len,
175
- verbose=False,
176
- ):
177
- """
178
- Checks if two sentences are similar based on exact match or
179
- longest common subsequence.
180
-
181
- Args:
182
- input_sentence: The input sentence.
183
- source_sentence: The source sentence.
184
- min_same_sentence_len: Minimum length for exact sentence match.
185
- min_phrase_sentence_len: Minimum length for common subsequence match.
186
- verbose: If True, print debug information.
187
-
188
- Returns:
189
- True if the sentences are considered similar, False otherwise.
190
- Returns False if input is not valid.
191
- """
192
-
193
- if not isinstance(input_sentence, str) or not isinstance(
194
- source_sentence,
195
- str,
196
- ):
197
- return False
198
-
199
- input_sentence = input_sentence.strip()
200
- source_sentence = source_sentence.strip()
201
-
202
- if not input_sentence or not source_sentence: # handle empty string
203
- return False
204
-
205
- input_words = input_sentence.split() # split without arguments
206
- source_words = source_sentence.split() # split without arguments
207
-
208
- if (
209
- input_sentence == source_sentence
210
- and len(input_words) >= min_same_sentence_len
211
- ):
212
- if verbose:
213
- print("Exact match found.")
214
- return True
215
-
216
- max_overlap_len = longest_common_subsequence(input_words, source_words)
217
- if verbose:
218
- print(f"Max overlap length: {max_overlap_len}") # print overlap length
219
- if max_overlap_len >= min_phrase_sentence_len:
220
- return True
221
-
222
- return False
223
-
224
-
225
- def check_paraphrase(input_text, source_text, url):
226
  """
227
- Checks if the input text is paraphrased in the content at the given URL.
 
228
 
229
  Args:
230
- input_text: The text to check for paraphrase.
231
- page_text: The text of the web page to compare with.
232
- url
233
 
234
  Returns:
235
- A tuple containing:
236
-
 
 
 
 
 
237
  """
238
-
239
  # Extract sentences from input text and web page
240
  input_sentences = split_into_sentences(input_text)
241
 
242
  if not source_text:
243
  return {}
244
-
245
  source_sentences = split_into_sentences(source_text)
 
246
  if not input_sentences or not source_sentences:
247
  return {}
248
 
 
 
249
  additional_sentences = []
250
  for sentence in source_sentences:
251
  if ", external" in sentence:
252
  additional_sentences.append(sentence.replace(", external", ""))
253
  source_sentences.extend(additional_sentences)
254
 
255
- # Encode sentences into embeddings
256
- embeddings1 = PARAPHASE_MODEL.encode(
257
  input_sentences,
258
  convert_to_tensor=True,
259
  device=DEVICE,
260
  show_progress_bar=False,
261
  )
262
- embeddings2 = PARAPHASE_MODEL.encode(
263
  source_sentences,
264
  convert_to_tensor=True,
265
  device=DEVICE,
@@ -272,78 +199,53 @@ def check_paraphrase(input_text, source_text, url):
272
  # Find sentence alignments
273
  inputs = ""
274
  sources = ""
275
- similarities = []
276
-
277
  for i, sentence in enumerate(input_sentences):
278
  max_sim_index = np.argmax(similarity_matrix[i])
279
  max_similarity = similarity_matrix[i][max_sim_index]
280
  best_matched_sentence = source_sentences[max_sim_index]
281
-
282
  inputs += sentence + " "
283
  sources += best_matched_sentence + " "
284
  similarities.append(max_similarity)
285
 
286
-
287
  similarity = sum(similarities) / len(similarities)
288
  label, is_paraphrased = determine_label(max_similarity)
 
 
289
  alignment = {
290
- "input": inputs,
291
- "source": sources,
292
- "similarity": similarity,
293
- "label": label,
294
- "paraphrase": is_paraphrased,
295
- "url": url,
296
- }
 
297
  print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')
298
 
299
  return alignment
300
 
301
 
302
- def similarity_ratio(a, b):
303
  """
304
- Calculates the similarity ratio between two strings using SequenceMatcher.
305
 
306
  Args:
307
- a: The first string.
308
- b: The second string.
309
-
310
- Returns:
311
- A float representing the similarity ratio between 0.0 and 1.0.
312
- Returns 0.0 if either input is None or not a string.
313
- """
314
- if (
315
- not isinstance(a, str)
316
- or not isinstance(b, str)
317
- or a is None
318
- or b is None
319
- ):
320
- return 0.0 # Handle cases where inputs are not strings or None
321
- return SequenceMatcher(None, a, b).ratio()
322
-
323
-
324
- def check_human(alligned_sentences):
325
- """
326
- Checks if a sufficient number of input sentences are found within
327
- source sentences.
328
 
329
  Returns:
330
- bool: True if the condition is met, False otherwise.
 
331
  """
332
- if not alligned_sentences: # Handle empty data case
333
- return False
334
-
335
- if alligned_sentences["similarity"] >= 0.99:
336
- return True
337
- return False
338
-
339
-
340
- def determine_label(similarity):
341
  if similarity >= PARAPHRASE_THRESHOLD_HUMAN:
342
- return "HUMAN", True
343
  elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
344
- return "MACHINE", True
345
  else:
346
- return None, False
347
 
348
 
349
  if __name__ == "__main__":
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
+
6
  import warnings
 
7
 
 
8
  import numpy as np
9
+ from pandas import DataFrame
10
+ from sentence_transformers import util
11
+
12
+ from src.application.config import (
13
+ DEVICE,
14
+ MAX_CHAR_SIZE,
15
+ PARAPHRASE_MODEL,
16
+ PARAPHRASE_THRESHOLD_HUMAN,
17
+ PARAPHRASE_THRESHOLD_MACHINE,
18
+ TOP_URLS_PER_SEARCH,
19
  )
 
20
  from src.application.text.preprocessing import split_into_sentences
21
  from src.application.text.search import (
22
  generate_search_phrases,
 
26
 
27
  warnings.simplefilter(action="ignore", category=FutureWarning)
28
 
29
 
30
+ def find_sentence_source(
31
+ text: list,
32
+ text_index: int,
33
+ sentences_df: DataFrame,
34
+ ) -> tuple[DataFrame, list]:
35
+ """
36
+ Finds the source URL for a given sentence by searching Google
37
+ and checking for paraphrases.
38
 
39
+ Args:
40
+ text (list): A list of sentences.
41
+ text_index (int): The index of the sentence to find the source for.
42
+ sentences_df (pd.DataFrame): A DF to store sentence information.
43
 
44
+ Returns:
45
+ tuple: A tuple of the updated sentences_df and a list of image URLs.
46
+ If a source is found, the DF is updated with source information.
47
+ If no source is found, the DF is updated with the original input.
48
+ """
49
+ checked_urls = set() # Keep track of visited URLs to avoid redundant checks
52
  searched_phrases = generate_search_phrases(text[text_index])
53
 
54
  for candidate in searched_phrases:
55
+ # Search Google for the generated phrase
56
  search_results = search_by_google(candidate)
57
+
58
+ # Extract URLs from search results
59
  urls = [item["link"] for item in search_results.get("items", [])]
60
 
61
+ # Check only the top TOP_URLS_PER_SEARCH URLs from each search
62
+ for url in urls[:TOP_URLS_PER_SEARCH]:
63
+ if url in checked_urls: # Skip already checked URLs
64
  continue
65
+ if "bbc.com" not in url: # TODO: remove when releasing
66
  continue
67
 
68
  checked_urls.add(url)
 
108
  if c in sentences_df.columns:
109
  sentences_df.loc[text_index, c] = aligned_sentence[c]
110
 
111
+ # Check other sentences for better matches in the same source
112
  for idx, _ in sentences_df.iterrows():
113
  similarity = sentences_df.loc[idx, "similarity"]
114
  if similarity is not None:
115
  if similarity > PARAPHRASE_THRESHOLD_MACHINE:
116
  continue
117
 
 
118
  aligned_sentence = check_paraphrase(
119
  text[idx],
120
  source_text,
 
137
  sentences_df.loc[idx, c] = aligned_sentence[c]
138
  return sentences_df, content.images
139
 
140
+ # If no source is found, update the DF with the original input
141
  sentences_df.loc[text_index, "input"] = text[text_index]
142
  return sentences_df, []
143
 
144
 
145
+ def check_paraphrase(input_text: str, source_text: str, url: str) -> dict:
146
  """
147
+ Checks if the input text is a paraphrase of the source text
148
+ by comparing sentence-level similarities.
149
 
150
  Args:
151
+ input_text (str): The text to be checked for paraphrasing.
152
+ source_text (str): The source text to compare against.
153
+ url (str): The URL of the source text (for storing in the result).
154
 
155
  Returns:
156
+ dict: A dictionary containing the alignment information, including:
157
+ - "input": Concatenated input sentences.
158
+ - "source": Concatenated best-matched source sentences.
159
+ - "similarity": Average cosine similarity score.
160
+ - "label": Label determined based on similarity.
161
+ - "paraphrase": Boolean indicating if it's a paraphrase.
162
+ - "url": The source URL.
163
  """
 
164
  # Extract sentences from input text and web page
165
  input_sentences = split_into_sentences(input_text)
166
 
167
  if not source_text:
168
  return {}
 
169
  source_sentences = split_into_sentences(source_text)
170
+
171
  if not input_sentences or not source_sentences:
172
  return {}
173
 
174
+ # Handle external references in source sentences
175
+ # This is specific to BBC news articles
176
  additional_sentences = []
177
  for sentence in source_sentences:
178
  if ", external" in sentence:
179
  additional_sentences.append(sentence.replace(", external", ""))
180
  source_sentences.extend(additional_sentences)
181
 
182
+ # Encode sentences into embeddings using the PARAPHRASE_MODEL
183
+ embeddings1 = PARAPHRASE_MODEL.encode(
184
  input_sentences,
185
  convert_to_tensor=True,
186
  device=DEVICE,
187
  show_progress_bar=False,
188
  )
189
+ embeddings2 = PARAPHRASE_MODEL.encode(
190
  source_sentences,
191
  convert_to_tensor=True,
192
  device=DEVICE,
 
199
  # Find sentence alignments
200
  inputs = ""
201
  sources = ""
202
+ similarities = []
203
+
204
  for i, sentence in enumerate(input_sentences):
205
  max_sim_index = np.argmax(similarity_matrix[i])
206
  max_similarity = similarity_matrix[i][max_sim_index]
207
  best_matched_sentence = source_sentences[max_sim_index]
208
+
209
  inputs += sentence + " "
210
  sources += best_matched_sentence + " "
211
  similarities.append(max_similarity)
212
 
213
+ # Calculate average similarity and determine paraphrase label
214
  similarity = sum(similarities) / len(similarities)
215
  label, is_paraphrased = determine_label(max_similarity)
216
+
217
+ # Create the alignment dictionary
218
  alignment = {
219
+ "input": inputs,
220
+ "source": sources,
221
+ "similarity": similarity,
222
+ "label": label,
223
+ "paraphrase": is_paraphrased,
224
+ "url": url,
225
+ }
226
+
227
  print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')
228
 
229
  return alignment
230
 
231
 
232
+ def determine_label(similarity: float) -> tuple[str | None, bool]:
233
  """
234
+ Determines a label and paraphrase status based on the similarity score.
235
 
236
  Args:
237
+ similarity (float): The similarity score between two texts.
238
 
239
  Returns:
240
+ tuple: A tuple containing the label (str or None)
241
+ and a boolean indicating if it's a paraphrase.
242
  """
 
 
 
 
 
 
 
 
 
243
  if similarity >= PARAPHRASE_THRESHOLD_HUMAN:
244
+ return "HUMAN", True # Human paraphrase
245
  elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
246
+ return "MACHINE", True # Machine paraphrase
247
  else:
248
+ return None, False # Not a paraphrase
249
 
250
 
251
  if __name__ == "__main__":
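To illustrate the thresholding in determine_label, a small sketch using the values previously defined in this module (0.963 for human, 0.8 for machine), which now live in config:

    PARAPHRASE_THRESHOLD_HUMAN = 0.963
    PARAPHRASE_THRESHOLD_MACHINE = 0.8

    def determine_label(similarity):
        # Mirrors the threshold logic above.
        if similarity >= PARAPHRASE_THRESHOLD_HUMAN:
            return "HUMAN", True
        elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
            return "MACHINE", True
        return None, False

    print(determine_label(0.97))  # ('HUMAN', True)
    print(determine_label(0.85))  # ('MACHINE', True)
    print(determine_label(0.50))  # (None, False)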
test.py CHANGED
@@ -1,74 +1,3 @@
1
- import re
2
-
3
- def is_newline_after_text(text1, text2):
4
- """
5
- Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
6
-
7
- Args:
8
- text1: The text to search for.
9
- text2: The text to search within.
10
-
11
- Returns:
12
- A tuple: (True/False if text1 is found, True/False if next char is newline, or None if not found)
13
- """
14
-
15
- match = re.search(re.escape(text1), text2) #escape text1 to handle special characters
16
-
17
- if match:
18
- # Find the next non-space character
19
- next_char_index = match.end()
20
- while next_char_index < len(text2) and text2[next_char_index].isspace():
21
- next_char_index += 1
22
-
23
- if text2[next_char_index:next_char_index+2] == r'\n':
24
- print("newline found")
25
- if next_char_index < len(text2) and text2[next_char_index:next_char_index+2] == r'\n':
26
- return True
27
-
28
- return False
29
-
30
- def is_newline_after_text_2(text1, text2):
31
- """
32
- Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
33
-
34
- Args:
35
- text1: The text to search for.
36
- text2: The text to search within.
37
-
38
- Returns:
39
- True if next char is newline
40
- """
41
- text2 = text2.replace("\n", "\\n")
42
-
43
- ater_text = text2.split(text1)
44
- if len(ater_text) > 1:
45
- ater_text = ater_text[1].lstrip() # Remove spaces
46
- if ater_text.startswith('\n'):
47
- return True
48
- return False
49
-
50
- # Example usage:
51
- text1 = "hello"
52
- text2 = "some text hello \nmore text"
53
- result = is_newline_after_text_2(text1, text2)
54
- print(f"Next char is newline: {result}\n")
55
-
56
- text1 = "hello"
57
- text2 = "some text hello more text"
58
- result = is_newline_after_text_2(text1, text2)
59
- print(f"Next char is newline: {result}\n")
60
-
61
- text1 = "hello"
62
- text2 = "some text hello \nmore text"
63
- result = is_newline_after_text_2(text1, text2)
64
- print(f"Next char is newline: {result}\n")
65
-
66
- text1 = "hello"
67
- text2 = "some text hello\t\nmore text" #test tab space before newline
68
- result = is_newline_after_text_2(text1, text2)
69
- print(f"Next char is newline: {result}\n")
70
-
71
- text1 = "hello." #test special characters
72
- text2 = "some text hello. \nmore text"
73
- result = is_newline_after_text_2(text1, text2)
74
- print(f"Next char is newline: {result}\n")
 
1
+ a = [1, 2]
2
+ a.append(None)
3
+ print(a)