pmkhanh7890 committed on
Commit
7e6ffb4
·
1 Parent(s): 38fd181

update algorithm

Browse files
application.py CHANGED
@@ -251,4 +251,4 @@ between the input text and the source.
251
  ],
252
  )
253
 
254
- demo.launch(share=True)
 
251
  ],
252
  )
253
 
254
+ demo.launch(share=False)
src/application/content_detection.py CHANGED
@@ -1,5 +1,6 @@
1
  from difflib import SequenceMatcher
2
 
 
3
  import pandas as pd
4
 
5
  from src.application.image.image_detection import (
@@ -17,7 +18,7 @@ from src.application.text.preprocessing import split_into_paragraphs
17
  from src.application.text.search_detection import (
18
  check_human,
19
  detect_text_by_relative_search,
20
- find_text_source,
21
  )
22
 
23
 
@@ -41,8 +42,8 @@ class NewsVerification:
41
  self.aligned_sentences: list[dict] = []
42
  self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
43
  columns=[
44
- "input_sentence",
45
- "matched_sentence",
46
  "label",
47
  "similarity",
48
  "paraphrase",
@@ -65,105 +66,19 @@ class NewsVerification:
65
  self.news_image = news_image
66
 
67
  def determine_text_origin(self):
68
- """
69
- Determines the origin of the given text based on paraphrasing detection
70
- and human authorship analysis.
 
 
 
71
 
72
- Args:
73
- text: The input text to be analyzed.
74
 
75
- Returns:
76
- str: The predicted origin of the text:
77
- - "HUMAN": If the text is likely written by a human.
78
- - "MACHINE": If the text is likely generated by a machine.
79
- """
80
- print("CHECK TEXT:")
81
- print("\tFrom search engine:")
82
- # Classify by search engine
83
- input_sentences = split_into_paragraphs(self.news_text)
84
- current_index = 0
85
- previous_paraphrase = None
86
- ai_sentence = {
87
- "input_sentence": "",
88
- "matched_sentence": "",
89
- "label": "",
90
- "similarity": None,
91
- "paraphrase": False,
92
- "url": "",
93
- }
94
 
95
- for index, sentence in enumerate(input_sentences):
96
- print(f"-------index = {index}-------")
97
- print(f"current_sentence = {input_sentences[index]}")
98
-
99
- if current_index >= len(input_sentences):
100
- break
101
- if (
102
- current_index > index
103
- and index != 0
104
- and index != len(input_sentences) - 1
105
- ):
106
- continue
107
-
108
- (
109
- paraphrase,
110
- text_url,
111
- searched_sentences,
112
- img_urls,
113
- current_index,
114
- ) = detect_text_by_relative_search(input_sentences, index)
115
-
116
- if paraphrase is False:
117
- # add sentence to ai_sentence
118
- if ai_sentence["input_sentence"] != "":
119
- ai_sentence["input_sentence"] += "<br>"
120
- ai_sentence["input_sentence"] += sentence
121
- if index == len(input_sentences) - 1:
122
- # add ai_sentences to align_sentences
123
- text_prediction_label, text_prediction_score = (
124
- detect_text_by_ai_model(ai_sentence["input_sentence"])
125
- )
126
- ai_sentence["label"] = text_prediction_label
127
- ai_sentence["similarity"] = text_prediction_score
128
- self.aligned_sentences.append(ai_sentence)
129
- else:
130
- if previous_paraphrase is False or previous_paraphrase is None:
131
- # add ai_sentences to align_sentences
132
- if ai_sentence[
133
- "input_sentence"
134
- ] != "" or current_index >= len(input_sentences):
135
- text_prediction_label, text_prediction_score = (
136
- detect_text_by_ai_model(
137
- ai_sentence["input_sentence"],
138
- )
139
- )
140
- ai_sentence["label"] = text_prediction_label
141
- ai_sentence["similarity"] = text_prediction_score
142
- self.aligned_sentences.append(ai_sentence)
143
-
144
- # reset
145
- ai_sentence = {
146
- "input_sentence": "",
147
- "matched_sentence": "",
148
- "label": "",
149
- "similarity": None,
150
- "paraphrase": False,
151
- "url": "",
152
- }
153
-
154
- # add searched_sentences to align_sentences
155
- if searched_sentences["input_sentence"] != "":
156
- self.found_img_url.extend(img_urls)
157
- if check_human(searched_sentences):
158
- searched_sentences["label"] = "HUMAN"
159
- else:
160
- searched_sentences["label"] = "MACHINE"
161
-
162
- self.aligned_sentences.append(searched_sentences)
163
-
164
- previous_paraphrase = paraphrase
165
-
166
- def determine_text_origin_2(self):
167
  """
168
  Determines the origin of the given text based on paraphrasing detection
169
  and human authorship analysis.
@@ -180,25 +95,56 @@ class NewsVerification:
180
  print("\tFrom search engine:")
181
  # Classify by search engine
182
  input_sentences = split_into_paragraphs(self.news_text)
183
- for _ in range(5):
 
 
 
184
  self.aligned_sentences_df = pd.concat(
185
- [self.aligned_sentences_df, pd.DataFrame([{}])],
186
- ignore_index=False,
 
 
 
 
 
 
 
 
187
  )
188
 
 
189
  for index, sentence in enumerate(input_sentences):
 
 
 
190
  print(f"-------index = {index}-------")
191
  print(f"current_sentence = {input_sentences[index]}")
192
 
193
- if self.aligned_sentences_df["url"] is not None:
194
- continue
195
-
196
- self.aligned_sentences_df, img_urls = find_text_source(
197
- input_sentences[index],
198
  self.aligned_sentences_df,
199
  )
200
 
201
- def detect_image_origin(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  print("CHECK IMAGE:")
203
  if self.news_image is None:
204
  self.image_prediction_label = "UNKNOWN"
@@ -268,15 +214,15 @@ class NewsVerification:
268
 
269
  def generate_analysis_report(self):
270
  self.determine_text_origin()
271
- self.detect_image_origin()
272
 
273
  def analyze_details(self):
274
  entities_with_colors = []
275
  for index, aligned_sentence in enumerate(self.aligned_sentences):
276
  # Get entity-words (in pair) with colors
277
  entities_with_colors = highlight_entities(
278
- aligned_sentence["input_sentence"],
279
- aligned_sentence["matched_sentence"],
280
  )
281
  self.aligned_sentences[index]["entities"] = entities_with_colors
282
 
@@ -332,19 +278,19 @@ class NewsVerification:
332
  rows.append(self.format_image_fact_checker_row(max_length))
333
 
334
  for aligned_sentence in self.aligned_sentences:
335
- if "input_sentence" not in aligned_sentence:
336
  continue
337
 
338
  # Get index of equal phrases in input and source sentences
339
  equal_idx_1, equal_idx_2 = extract_equal_text(
340
- aligned_sentence["input_sentence"],
341
- aligned_sentence["matched_sentence"],
342
  )
343
 
344
  # Get entity-words (in pair) with colors
345
  # entities_with_colors = highlight_entities(
346
- # aligned_sentence["input_sentence"],
347
- # aligned_sentence["matched_sentence"],
348
  # )
349
 
350
  self.fact_checker_table.append(
@@ -386,17 +332,17 @@ class NewsVerification:
386
 
387
  def format_text_fact_checker_row(self, row, max_length=30):
388
  entity_count = 0
389
- if row[0]["input_sentence"] == "":
390
  return ""
391
- if row[0]["matched_sentence"] != "": # source is not empty
392
  # highlight entities
393
  input_sentence, highlight_idx_input = apply_highlight(
394
- row[0]["input_sentence"],
395
  row[3],
396
  "input",
397
  )
398
  source_sentence, highlight_idx_source = apply_highlight(
399
- row[0]["matched_sentence"],
400
  row[3],
401
  "source",
402
  )
@@ -423,8 +369,8 @@ class NewsVerification:
423
  "span style",
424
  ).replace("1px_4px", "1px 4px")
425
  else:
426
- input_sentence = row[0]["input_sentence"]
427
- source_sentence = row[0]["matched_sentence"]
428
 
429
  label = row[0]["label"]
430
  score = row[0]["similarity"]
@@ -497,9 +443,9 @@ class NewsVerification:
497
  scores = 0
498
  sentence_count = 0
499
  for index, row in enumerate(self.aligned_sentences):
500
- if row["input_sentence"] == "":
501
  continue
502
- input_sentences += row["input_sentence"] + "<br><br>"
503
  label = self.aligned_sentences[index]["label"]
504
 
505
  url = self.aligned_sentences[index]["url"] #
@@ -539,19 +485,19 @@ class NewsVerification:
539
  rows.append(self.format_image_governor_row(max_length))
540
 
541
  for aligned_sentence in self.aligned_sentences:
542
- if "input_sentence" not in aligned_sentence:
543
  continue
544
 
545
  # Get index of equal phrases in input and source sentences
546
  equal_idx_1, equal_idx_2 = extract_equal_text(
547
- aligned_sentence["input_sentence"],
548
- aligned_sentence["matched_sentence"],
549
  )
550
 
551
  # Get entity-words (in pair) with colors
552
  # entities_with_colors = highlight_entities(
553
- # aligned_sentence["input_sentence"],
554
- # aligned_sentence["matched_sentence"],
555
  # )
556
 
557
  self.governor_table.append(
@@ -599,19 +545,19 @@ class NewsVerification:
599
  entity_count = 0
600
  for row in self.governor_table:
601
  print(f"governor_row: {row}")
602
- if row[0]["input_sentence"] == "":
603
  continue
604
 
605
- if row[0]["matched_sentence"] != "": # source is not empty
606
  # highlight entities
607
  input_sentence, highlight_idx_input = apply_highlight(
608
- row[0]["input_sentence"],
609
  row[3],
610
  "input",
611
  entity_count,
612
  )
613
  source_sentence, highlight_idx_source = apply_highlight(
614
- row[0]["matched_sentence"],
615
  row[3],
616
  "source",
617
  entity_count,
@@ -640,8 +586,8 @@ class NewsVerification:
640
  ).replace("1px_4px", "1px 4px")
641
 
642
  else:
643
- input_sentence = row[0]["input_sentence"]
644
- source_sentence = row[0]["matched_sentence"]
645
 
646
  # convert score to HUMAN-based score:
647
  input_sentences += input_sentence + "<br><br>"
@@ -819,7 +765,7 @@ class NewsVerification:
819
  machine_score = []
820
  machine_flag = False
821
  for sentence in self.aligned_sentences:
822
- if sentence["input_sentence"] == "":
823
  continue
824
  if sentence["label"] == "HUMAN":
825
  human_score.append(sentence["similarity"])
 
1
  from difflib import SequenceMatcher
2
 
3
+ import numpy as np
4
  import pandas as pd
5
 
6
  from src.application.image.image_detection import (
 
18
  from src.application.text.search_detection import (
19
  check_human,
20
  detect_text_by_relative_search,
21
+ find_paragraph_source,
22
  )
23
 
24
 
 
42
  self.aligned_sentences: list[dict] = []
43
  self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
44
  columns=[
45
+ "input",
46
+ "source",
47
  "label",
48
  "similarity",
49
  "paraphrase",
 
66
  self.news_image = news_image
67
 
68
  def determine_text_origin(self):
69
+ self.find_text_source()
70
+ label, score = self.verify_text()
71
+ if label == "UNKNOWN":
72
+ # Concatenate text from "input" in sentence_df
73
+ print(self.aligned_sentences_df["input"])
74
+ text = " ".join(self.aligned_sentences_df["input"].tolist())
75
 
76
+ # detect by baseline model
77
+ label, score = detect_text_by_ai_model(text)
78
 
79
+ return label, score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
+ def find_text_source(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  """
83
  Determines the origin of the given text based on paraphrasing detection
84
  and human authorship analysis.
 
95
  print("\tFrom search engine:")
96
  # Classify by search engine
97
  input_sentences = split_into_paragraphs(self.news_text)
98
+
99
+ # Setup df for input_sentences
100
+
101
+ for _ in range(len(input_sentences)):
102
  self.aligned_sentences_df = pd.concat(
103
+ [self.aligned_sentences_df, pd.DataFrame([{
104
+ "input": None,
105
+ "source": None,
106
+ "label": None,
107
+ "similarity": None,
108
+ "paraphrase": None,
109
+ "url": None,
110
+ "entities": None,
111
+ }])],
112
+ ignore_index=True,
113
  )
114
 
115
+ # find a source for each paragraph
116
  for index, sentence in enumerate(input_sentences):
117
+ if self.aligned_sentences_df.loc[index, "url"] is not None:
118
+ continue
119
+
120
  print(f"-------index = {index}-------")
121
  print(f"current_sentence = {input_sentences[index]}")
122
 
123
+ self.aligned_sentences_df, img_urls = find_paragraph_source(
124
+ input_sentences,
125
+ index,
 
 
126
  self.aligned_sentences_df,
127
  )
128
 
129
+ self.found_img_url.extend(img_urls)
130
+
131
+ # determine if the whole source is from a news or not
132
+
133
+ def verify_text(self):
134
+ # calculate the average similarity when the similary score in each row of sentences_df is higher than 0.8
135
+ filtered_by_similarity = self.aligned_sentences_df[
136
+ self.aligned_sentences_df["similarity"] > 0.8
137
+ ]
138
+ if len(filtered_by_similarity) / len(self.aligned_sentences_df) > 2:
139
+ avg_similarity = filtered_by_similarity.similarity.mean()
140
+ if avg_similarity > 0.963:
141
+ return "HUMAN", avg_similarity
142
+ if avg_similarity > 0.8:
143
+ return "MACHINE", avg_similarity
144
+
145
+ return "UNKNOWN", 0.0
146
+
147
+ def determine_image_origin(self):
148
  print("CHECK IMAGE:")
149
  if self.news_image is None:
150
  self.image_prediction_label = "UNKNOWN"
 
214
 
215
  def generate_analysis_report(self):
216
  self.determine_text_origin()
217
+ self.determine_image_origin()
218
 
219
  def analyze_details(self):
220
  entities_with_colors = []
221
  for index, aligned_sentence in enumerate(self.aligned_sentences):
222
  # Get entity-words (in pair) with colors
223
  entities_with_colors = highlight_entities(
224
+ aligned_sentence["input"],
225
+ aligned_sentence["source"],
226
  )
227
  self.aligned_sentences[index]["entities"] = entities_with_colors
228
 
 
278
  rows.append(self.format_image_fact_checker_row(max_length))
279
 
280
  for aligned_sentence in self.aligned_sentences:
281
+ if "input" not in aligned_sentence:
282
  continue
283
 
284
  # Get index of equal phrases in input and source sentences
285
  equal_idx_1, equal_idx_2 = extract_equal_text(
286
+ aligned_sentence["input"],
287
+ aligned_sentence["source"],
288
  )
289
 
290
  # Get entity-words (in pair) with colors
291
  # entities_with_colors = highlight_entities(
292
+ # aligned_sentence["input"],
293
+ # aligned_sentence["source"],
294
  # )
295
 
296
  self.fact_checker_table.append(
 
332
 
333
  def format_text_fact_checker_row(self, row, max_length=30):
334
  entity_count = 0
335
+ if row[0]["input"] == "":
336
  return ""
337
+ if row[0]["source"] != "": # source is not empty
338
  # highlight entities
339
  input_sentence, highlight_idx_input = apply_highlight(
340
+ row[0]["input"],
341
  row[3],
342
  "input",
343
  )
344
  source_sentence, highlight_idx_source = apply_highlight(
345
+ row[0]["source"],
346
  row[3],
347
  "source",
348
  )
 
369
  "span style",
370
  ).replace("1px_4px", "1px 4px")
371
  else:
372
+ input_sentence = row[0]["input"]
373
+ source_sentence = row[0]["source"]
374
 
375
  label = row[0]["label"]
376
  score = row[0]["similarity"]
 
443
  scores = 0
444
  sentence_count = 0
445
  for index, row in enumerate(self.aligned_sentences):
446
+ if row["input"] == "":
447
  continue
448
+ input_sentences += row["input"] + "<br><br>"
449
  label = self.aligned_sentences[index]["label"]
450
 
451
  url = self.aligned_sentences[index]["url"] #
 
485
  rows.append(self.format_image_governor_row(max_length))
486
 
487
  for aligned_sentence in self.aligned_sentences:
488
+ if "input" not in aligned_sentence:
489
  continue
490
 
491
  # Get index of equal phrases in input and source sentences
492
  equal_idx_1, equal_idx_2 = extract_equal_text(
493
+ aligned_sentence["input"],
494
+ aligned_sentence["source"],
495
  )
496
 
497
  # Get entity-words (in pair) with colors
498
  # entities_with_colors = highlight_entities(
499
+ # aligned_sentence["input"],
500
+ # aligned_sentence["source"],
501
  # )
502
 
503
  self.governor_table.append(
 
545
  entity_count = 0
546
  for row in self.governor_table:
547
  print(f"governor_row: {row}")
548
+ if row[0]["input"] == "":
549
  continue
550
 
551
+ if row[0]["source"] != "": # source is not empty
552
  # highlight entities
553
  input_sentence, highlight_idx_input = apply_highlight(
554
+ row[0]["input"],
555
  row[3],
556
  "input",
557
  entity_count,
558
  )
559
  source_sentence, highlight_idx_source = apply_highlight(
560
+ row[0]["source"],
561
  row[3],
562
  "source",
563
  entity_count,
 
586
  ).replace("1px_4px", "1px 4px")
587
 
588
  else:
589
+ input_sentence = row[0]["input"]
590
+ source_sentence = row[0]["source"]
591
 
592
  # convert score to HUMAN-based score:
593
  input_sentences += input_sentence + "<br><br>"
 
765
  machine_score = []
766
  machine_flag = False
767
  for sentence in self.aligned_sentences:
768
+ if sentence["input"] == "":
769
  continue
770
  if sentence["label"] == "HUMAN":
771
  human_score.append(sentence["similarity"])
src/application/text/search_detection.py CHANGED
@@ -4,6 +4,7 @@ from difflib import SequenceMatcher
4
 
5
  import nltk
6
  import numpy as np
 
7
  import torch
8
  from sentence_transformers import (
9
  SentenceTransformer,
@@ -30,13 +31,13 @@ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
30
  PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
31
  PARAPHASE_MODEL.to(DEVICE)
32
 
33
- BATCH_SIZE = 8
34
-
35
  PARAPHRASE_THRESHOLD = 0.8
36
- PARAPHRASE_THRESHOLD_FOR_OPPOSITE = 0.7
37
  MIN_SAME_SENTENCE_LEN = 6
38
  MIN_PHRASE_SENTENCE_LEN = 10
39
- MIN_RATIO_PARAPHRASE_NUM = 0.7
40
  MAX_CHAR_SIZE = 30000
41
 
42
 
@@ -73,15 +74,16 @@ def detect_text_by_relative_search(
73
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
74
  continue
75
  print(f"\t\t\t↑↑↑ Title: {content.title}")
76
- paraphrase, aligned_first_sentences = check_paraphrase(
77
  input_text[index],
78
  page_text,
79
  url,
80
  )
 
81
 
82
- if paraphrase is False:
83
  return (
84
- paraphrase,
85
  url,
86
  aligned_first_sentences,
87
  content.images,
@@ -96,29 +98,30 @@ def detect_text_by_relative_search(
96
  print(f"input_text_last: {input_text[-1]}")
97
  break
98
  print(f"input_text: {input_text[index]}")
99
- sub_paraphrase, sub_sentences = check_paraphrase(
100
  input_text[index],
101
  page_text,
102
  url,
103
  )
 
104
  print(f"sub_paraphrase: {sub_paraphrase}")
105
  print(f"sub_sentences: {sub_sentences}")
106
  if sub_paraphrase is True:
107
- aligned_first_sentences["input_sentence"] += (
108
- "<br>" + sub_sentences["input_sentence"]
109
  )
110
- aligned_first_sentences["matched_sentence"] += (
111
- "<br>" + sub_sentences["matched_sentence"]
112
  )
113
  aligned_first_sentences["similarity"] += sub_sentences[
114
  "similarity"
115
  ]
116
  aligned_first_sentences["similarity"] /= 2
117
 
118
- print(f"paraphrase: {paraphrase}")
119
  print(f"aligned_first_sentences: {aligned_first_sentences}")
120
  return (
121
- paraphrase,
122
  url,
123
  aligned_first_sentences,
124
  content.images,
@@ -128,19 +131,12 @@ def detect_text_by_relative_search(
128
  return False, None, [], [], index
129
 
130
 
131
- def find_text_source(text, text_index, sentences_df):
132
- sentence = {
133
- "input_sentence": text[text_index],
134
- "matched_sentence": "",
135
- "label": "",
136
- "similarity": None,
137
- "paraphrase": None,
138
- "url": "",
139
- "group": None,
140
- }
141
  checked_urls = set()
142
  searched_phrases = generate_search_phrases(text[text_index])
143
-
 
144
  for candidate in searched_phrases:
145
  search_results = search_by_google(candidate)
146
  urls = [item["link"] for item in search_results.get("items", [])]
@@ -166,63 +162,56 @@ def find_text_source(text, text_index, sentences_df):
166
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
167
  continue
168
  print(f"\t\t\t↑↑↑ Title: {content.title}")
169
- paraphrase, aligned_sentence = check_paraphrase(
170
- text,
171
  page_text,
172
  url,
173
  )
174
 
175
- # add one more key "group" into aligned_sentence
176
- sentences_df.loc[text_index, "input_sentence"] = (
177
- aligned_sentence["input_sentence"]
178
- )
179
- sentences_df.loc[text_index, "matched_sentence"] = (
180
- aligned_sentence["matched_sentence"]
181
- )
182
- sentences_df.loc[text_index, "label"] = aligned_sentence[
183
- "label"
184
- ]
185
- sentences_df.loc[text_index, "similarity"] = aligned_sentence[
186
- "similarity"
187
- ]
188
- sentences_df.loc[text_index, "url"] = aligned_sentence["url"]
189
-
190
  if aligned_sentence["paraphrase"] is False:
191
- return paraphrase, sentences_df
192
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  for text_index, _ in enumerate(sentences_df):
194
- if sentences_df[text_index]["url"] is not None:
 
195
  continue
196
 
197
  # find content in new url
198
- _, aligned_sentence = check_paraphrase(
199
  text[text_index],
200
  page_text,
201
  url,
202
  )
203
 
204
- if aligned_sentence["url"] is not None:
205
  continue
206
 
207
- sentences_df.loc[text_index, "input_sentence"] = (
208
- aligned_sentence["input_sentence"]
209
- )
210
- sentences_df.loc[text_index, "matched_sentence"] = (
211
- aligned_sentence["matched_sentence"]
212
- )
213
- sentences_df.loc[text_index, "label"] = aligned_sentence[
214
- "label"
215
- ]
216
- sentences_df.loc[text_index, "similarity"] = (
217
- aligned_sentence["similarity"]
218
- )
219
- sentences_df.loc[text_index, "url"] = aligned_sentence[
220
- "url"
221
- ]
222
 
223
  return sentences_df, content.images
224
 
225
- return sentence, []
226
 
227
 
228
  def longest_common_subsequence(arr1, arr2):
@@ -331,36 +320,31 @@ def check_paraphrase(input_text, page_text, url):
331
  A tuple containing:
332
 
333
  """
334
- is_paraphrase_text = False
335
-
336
- if not isinstance(input_text, str) or not isinstance(page_text, str):
337
- return False, []
338
 
339
  # Extract sentences from input text and web page
340
- # input_sentences = split_into_paragraphs(input_text)
341
- input_sentences = [input_text]
342
 
343
  if not page_text:
344
- return is_paraphrase_text, []
345
 
346
- page_sentences = split_into_paragraphs(page_text)
347
- if not input_sentences or not page_sentences:
348
- return is_paraphrase_text, []
349
 
350
  additional_sentences = []
351
- for sentence in page_sentences:
352
  if ", external" in sentence:
353
  additional_sentences.append(sentence.replace(", external", ""))
354
- page_sentences.extend(additional_sentences)
355
 
356
  # Encode sentences into embeddings
357
  embeddings1 = PARAPHASE_MODEL.encode(
358
- input_sentences,
359
  convert_to_tensor=True,
360
  device=DEVICE,
361
  )
362
  embeddings2 = PARAPHASE_MODEL.encode(
363
- page_sentences,
364
  convert_to_tensor=True,
365
  device=DEVICE,
366
  )
@@ -370,69 +354,28 @@ def check_paraphrase(input_text, page_text, url):
370
 
371
  # Find sentence alignments
372
  alignment = {}
373
- paraphrased_sentence_count = 0
374
- best_matched_sentence = ""
375
- for i, sentence1 in enumerate(input_sentences):
376
  max_sim_index = np.argmax(similarity_matrix[i])
377
  max_similarity = similarity_matrix[i][max_sim_index]
378
 
379
- best_matched_sentence = page_sentences[max_sim_index]
380
- is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
381
-
382
- if is_paraphrase_sentence is False:
383
- alignment = {
384
- "input_sentence": sentence1,
385
- "matched_sentence": "",
386
- "similarity": max_similarity,
387
- "label": "",
388
- "paraphrase": is_paraphrase_sentence,
389
- "url": "",
390
- }
391
  else:
392
- alignment = {
393
- "input_sentence": sentence1,
394
- "matched_sentence": page_sentences[max_sim_index],
395
- "similarity": max_similarity,
396
- "label": "",
397
- "paraphrase": is_paraphrase_sentence,
398
- "url": url,
399
- }
400
-
401
- # Check for individual sentence paraphrase
402
- # if overall paraphrase not yet found
403
- if not is_paraphrase_text and check_sentence(
404
- sentence1,
405
- page_sentences[max_sim_index],
406
- MIN_SAME_SENTENCE_LEN,
407
- MIN_PHRASE_SENTENCE_LEN,
408
- ):
409
- is_paraphrase_text = True
410
-
411
- # alignment.append(item)
412
- paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
413
-
414
- # Check if enough sentences are paraphrases
415
-
416
- is_paraphrase_text = (
417
- paraphrased_sentence_count > 0
418
- ) # min_matching_sentences
419
-
420
- # Method 2: Check if overlapped words between sentences are more than 50%
421
- equal_idx_1, _ = extract_equal_text(
422
- input_sentences[0],
423
- best_matched_sentence,
424
- )
425
- matched_count = 0
426
- for index in equal_idx_1:
427
- matched_count += index["end"] - index["start"]
428
- sent = input_sentences[0].translate(
429
- str.maketrans("", "", string.punctuation),
430
- )
431
- num_words = len(sent.split())
432
- if matched_count > num_words / 2:
433
- is_paraphrase_text = True
434
 
435
- return is_paraphrase_text, alignment
436
 
437
 
438
  def similarity_ratio(a, b):
@@ -472,6 +415,14 @@ def check_human(alligned_sentences):
472
  return True
473
  return False
474
 
 
 
 
 
 
 
 
 
475
 
476
  if __name__ == "__main__":
477
  pass
 
4
 
5
  import nltk
6
  import numpy as np
7
+ import pandas as pd
8
  import torch
9
  from sentence_transformers import (
10
  SentenceTransformer,
 
31
  PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
32
  PARAPHASE_MODEL.to(DEVICE)
33
 
34
+ PARAPHRASE_THRESHOLD_HUMAN = 0.963
35
+ PARAPHRASE_THRESHOLD_MACHINE = 0.8
36
  PARAPHRASE_THRESHOLD = 0.8
37
+
38
  MIN_SAME_SENTENCE_LEN = 6
39
  MIN_PHRASE_SENTENCE_LEN = 10
40
+ MIN_RATIO_PARAPHRASE_NUM = 0.5
41
  MAX_CHAR_SIZE = 30000
42
 
43
 
 
74
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
75
  continue
76
  print(f"\t\t\t↑↑↑ Title: {content.title}")
77
+ aligned_first_sentences = check_paraphrase(
78
  input_text[index],
79
  page_text,
80
  url,
81
  )
82
+ is_paraphrased = aligned_first_sentences["is_paraphrased"]
83
 
84
+ if is_paraphrased is False:
85
  return (
86
+ is_paraphrased,
87
  url,
88
  aligned_first_sentences,
89
  content.images,
 
98
  print(f"input_text_last: {input_text[-1]}")
99
  break
100
  print(f"input_text: {input_text[index]}")
101
+ sub_sentences = check_paraphrase(
102
  input_text[index],
103
  page_text,
104
  url,
105
  )
106
+ sub_paraphrase = sub_sentences["is_paraphrased"]
107
  print(f"sub_paraphrase: {sub_paraphrase}")
108
  print(f"sub_sentences: {sub_sentences}")
109
  if sub_paraphrase is True:
110
+ aligned_first_sentences["input"] += (
111
+ "<br>" + sub_sentences["input"]
112
  )
113
+ aligned_first_sentences["source"] += (
114
+ "<br>" + sub_sentences["source"]
115
  )
116
  aligned_first_sentences["similarity"] += sub_sentences[
117
  "similarity"
118
  ]
119
  aligned_first_sentences["similarity"] /= 2
120
 
121
+ print(f"paraphrase: {is_paraphrased}")
122
  print(f"aligned_first_sentences: {aligned_first_sentences}")
123
  return (
124
+ is_paraphrased,
125
  url,
126
  aligned_first_sentences,
127
  content.images,
 
131
  return False, None, [], [], index
132
 
133
 
134
+ def find_paragraph_source(text, text_index, sentences_df):
135
+
 
 
 
 
 
 
 
 
136
  checked_urls = set()
137
  searched_phrases = generate_search_phrases(text[text_index])
138
+ print(f"text[text_index]: {text[text_index]}")
139
+ print(f"searched_phrases: {searched_phrases}")
140
  for candidate in searched_phrases:
141
  search_results = search_by_google(candidate)
142
  urls = [item["link"] for item in search_results.get("items", [])]
 
162
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
163
  continue
164
  print(f"\t\t\t↑↑↑ Title: {content.title}")
165
+ aligned_sentence = check_paraphrase(
166
+ text[text_index],
167
  page_text,
168
  url,
169
  )
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  if aligned_sentence["paraphrase"] is False:
172
+ print(f'sentence_1: {sentences_df.loc[text_index, "input"]}')
173
+ print(f'sentence_2: {aligned_sentence["input"]}')
174
+ sentences_df.loc[text_index, "input"] = aligned_sentence["input"]
175
+ sentences_df.loc[text_index, "paraphrase"] = aligned_sentence["paraphrase"]
176
+ return sentences_df, []
177
+ # assign values
178
+ columns = [
179
+ "input",
180
+ "source",
181
+ "label",
182
+ "similarity",
183
+ "paraphrase",
184
+ "url",
185
+ ]
186
+ for c in columns:
187
+ if c in sentences_df.columns:
188
+ sentences_df.loc[text_index, c] = aligned_sentence[c]
189
+
190
+
191
+ print(f"sen: {sentences_df}")
192
  for text_index, _ in enumerate(sentences_df):
193
+ print(f"{text_index}")
194
+ if sentences_df.loc[text_index, "url"] is not None:
195
  continue
196
 
197
  # find content in new url
198
+ aligned_sentence = check_paraphrase(
199
  text[text_index],
200
  page_text,
201
  url,
202
  )
203
 
204
+ if aligned_sentence["url"] is None:
205
  continue
206
 
207
+ columns = ["input", "source", "label", "similarity", "url"]
208
+ for c in columns:
209
+ if c in sentences_df.columns:
210
+ sentences_df.loc[text_index, c] = aligned_sentence[c]
 
 
 
 
 
 
 
 
 
 
 
211
 
212
  return sentences_df, content.images
213
 
214
+ return sentences_df, []
215
 
216
 
217
  def longest_common_subsequence(arr1, arr2):
 
320
  A tuple containing:
321
 
322
  """
 
 
 
 
323
 
324
  # Extract sentences from input text and web page
325
+ input_paragraphs = [input_text]
 
326
 
327
  if not page_text:
328
+ return {}
329
 
330
+ page_paragraphs = split_into_paragraphs(page_text)
331
+ if not input_paragraphs or not page_paragraphs:
332
+ return {}
333
 
334
  additional_sentences = []
335
+ for sentence in page_paragraphs:
336
  if ", external" in sentence:
337
  additional_sentences.append(sentence.replace(", external", ""))
338
+ page_paragraphs.extend(additional_sentences)
339
 
340
  # Encode sentences into embeddings
341
  embeddings1 = PARAPHASE_MODEL.encode(
342
+ input_paragraphs,
343
  convert_to_tensor=True,
344
  device=DEVICE,
345
  )
346
  embeddings2 = PARAPHASE_MODEL.encode(
347
+ page_paragraphs,
348
  convert_to_tensor=True,
349
  device=DEVICE,
350
  )
 
354
 
355
  # Find sentence alignments
356
  alignment = {}
357
+ for i, paragraph in enumerate(input_paragraphs):
 
 
358
  max_sim_index = np.argmax(similarity_matrix[i])
359
  max_similarity = similarity_matrix[i][max_sim_index]
360
 
361
+ label, is_paraphrased = determine_label(max_similarity)
362
+ print(f"is_paraphrased: {is_paraphrased}")
363
+ if is_paraphrased is False:
364
+ url = None
365
+ best_matched_paragraph = None
 
 
 
 
 
 
 
366
  else:
367
+ best_matched_paragraph = page_paragraphs[max_sim_index]
368
+
369
+ alignment = {
370
+ "input": paragraph,
371
+ "source": best_matched_paragraph,
372
+ "similarity": max_similarity,
373
+ "label": label,
374
+ "paraphrase": is_paraphrased,
375
+ "url": url,
376
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
 
378
+ return alignment
379
 
380
 
381
  def similarity_ratio(a, b):
 
415
  return True
416
  return False
417
 
418
def determine_label(similarity):
    """
    Translate a similarity score into a label and a paraphrase flag.

    Args:
        similarity: Similarity score compared against the module-level
            thresholds.

    Returns:
        tuple: ``("HUMAN", True)`` when the score reaches
            PARAPHRASE_THRESHOLD_HUMAN, ``("MACHINE", True)`` when it
            reaches PARAPHRASE_THRESHOLD_MACHINE, otherwise ``("", False)``.
    """
    # Ordered from the strictest threshold to the loosest; first hit wins.
    label_thresholds = (
        (PARAPHRASE_THRESHOLD_HUMAN, "HUMAN"),
        (PARAPHRASE_THRESHOLD_MACHINE, "MACHINE"),
    )
    for minimum, label in label_thresholds:
        if similarity >= minimum:
            return label, True

    return "", False
425
+
426
 
427
  if __name__ == "__main__":
428
  pass