pmkhanh7890 committed a5e8d12 (parent: 0260491)

Separate each row by paragraph
gpt_test.py CHANGED
@@ -96,12 +96,12 @@ azure_client = AzureOpenAI(
     api_version="2024-05-01-preview",
 )
 
-deplopment_name = "gpt-4o-mini"  # "o1-mini" # or "gpt-4o"
+deplopment_name = "gpt-4o"  # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
 TEXT_PROMPT = """
 Paraphrase the following news, only output the paraphrased text:
 
 """
-text = get_first_column("data/MAGE.csv")
+text = get_first_column("data/MAGE_2.csv")
 count = 0
 for index, news in enumerate(text):
     if count > 1000:
@@ -127,4 +127,4 @@ for index, news in enumerate(text):
         count += 1
         paraphrased_news = response.choices[0].message.content
 
-        add_text_to_csv("data/MAGE_4o_mini.csv", paraphrased_news, count)
+        add_text_to_csv("data/MAGE_2_4o.csv", paraphrased_news, count)
src/application/content_detection.py CHANGED
@@ -16,10 +16,10 @@ from src.application.text.model_detection import (
     detect_text_by_ai_model,
     predict_generation_model,
 )
-from src.application.text.preprocessing import split_into_paragraphs
+from src.application.text.preprocessing import split_into_paragraphs, split_into_sentences
 from src.application.text.search_detection import (
     PARAPHRASE_THRESHOLD_MACHINE,
-    find_paragraph_source,
+    find_sentence_source,
 )
 
 
@@ -44,7 +44,7 @@ class NewsVerification:
         self.found_img_url: list[str] = []
 
         # Analyzed results
-        self.aligned_paragraphs_df: pd.DataFrame = pd.DataFrame(
+        self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
             columns=[
                 "input",
                 "source",
@@ -78,7 +78,7 @@ class NewsVerification:
                 series.astype(str).tolist(),
             )  # Handle mixed data types and NaNs
 
-        self.grouped_url_df = self.aligned_paragraphs_df.groupby("url").agg(
+        self.grouped_url_df = self.aligned_sentences_df.groupby("url").agg(
             {
                 "input": concat_text,
                 "source": concat_text,
@@ -89,7 +89,7 @@ class NewsVerification:
         self.grouped_url_df["label"] = None
         self.grouped_url_df["score"] = None
 
-        print(f"aligned_paragraphs_df:\n {self.aligned_paragraphs_df}")
+        print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")
 
         for index, row in self.grouped_url_df.iterrows():
             label, score = self.verify_text(row["url"])
@@ -112,22 +112,20 @@ class NewsVerification:
                     na=False,
                 )
             ]
-            # machine_label = self.aligned_paragraphs_df[
-            #     self.aligned_paragraphs_df["label"] == "MACHINE"
-            # ]
+
             if len(machine_label) > 0:
                 label = " ".join(machine_label["label"].tolist())
                 self.text_prediction_label[0] = label
                 self.text_prediction_score[0] = machine_label["score"].mean()
             else:
-                machine_label = self.aligned_paragraphs_df[
-                    self.aligned_paragraphs_df["label"] == "HUMAN"
+                machine_label = self.aligned_sentences_df[
+                    self.aligned_sentences_df["label"] == "HUMAN"
                 ]
                 self.text_prediction_label[0] = "HUMAN"
                 self.text_prediction_score[0] = machine_label["score"].mean()
         else:  # no source found in the input text
             print("No source found in the input text")
-            text = " ".join(self.aligned_paragraphs_df["input"].tolist())
+            text = " ".join(self.aligned_sentences_df["input"].tolist())
             # detect by baseline model
             label, score = detect_text_by_ai_model(text)
             self.text_prediction_label[0] = label
@@ -149,14 +147,15 @@ class NewsVerification:
         print("CHECK TEXT:")
         print("\tFrom search engine:")
         # Classify by search engine
-        input_sentences = split_into_paragraphs(self.news_text)
+        #input_sentences = split_into_sentences(self.news_text)
+        input_paragraphs = split_into_paragraphs(self.news_text)
 
         # Setup df for input_sentences
 
-        for _ in range(len(input_sentences)):
-            self.aligned_paragraphs_df = pd.concat(
+        for _ in range(len(input_paragraphs)):
+            self.aligned_sentences_df = pd.concat(
                 [
-                    self.aligned_paragraphs_df,
+                    self.aligned_sentences_df,
                     pd.DataFrame(
                         [
                             {
@@ -174,20 +173,20 @@ class NewsVerification:
                 ignore_index=True,
             )
 
-        # find a source for each paragraph
-        for index, _ in enumerate(input_sentences):
-            similarity = self.aligned_paragraphs_df.loc[index, "similarity"]
+        # find a source for each sentence
+        for index, _ in enumerate(input_paragraphs):
+            similarity = self.aligned_sentences_df.loc[index, "similarity"]
             if similarity is not None:
                 if similarity > PARAPHRASE_THRESHOLD_MACHINE:
                     continue
 
            print(f"\n-------index = {index}-------")
-            print(f"current_text = {input_sentences[index]}\n")
+            print(f"current_text = {input_paragraphs[index]}\n")
 
-            self.aligned_paragraphs_df, img_urls = find_paragraph_source(
-                input_sentences,
+            self.aligned_sentences_df, img_urls = find_sentence_source(
+                input_paragraphs,
                 index,
-                self.aligned_paragraphs_df,
+                self.aligned_sentences_df,
             )
 
             self.found_img_url.extend(img_urls)
@@ -199,13 +198,13 @@ class NewsVerification:
         score = 0
         # calculate the average similarity when the similarity score
         # in each row of sentences_df is higher than 0.8
-        filtered_by_url = self.aligned_paragraphs_df[
-            self.aligned_paragraphs_df["url"] == url
+        filtered_by_url = self.aligned_sentences_df[
+            self.aligned_sentences_df["url"] == url
        ]
         filtered_by_similarity = filtered_by_url[
             filtered_by_url["similarity"] > 0.8
         ]
-        if len(filtered_by_similarity) / len(self.aligned_paragraphs_df) > 0.5:
+        if len(filtered_by_similarity) / len(self.aligned_sentences_df) > 0.5:
             # check if "MACHINE" is in self.aligned_sentences_df["label"]:
             contains_machine = (
                 filtered_by_similarity["label"]
@@ -305,9 +304,9 @@ class NewsVerification:
                 row["source"],
             )
 
-            for index, paragraph in self.aligned_paragraphs_df.iterrows():
-                if paragraph["url"] == row["url"]:
-                    self.aligned_paragraphs_df.at[index, "entities"] = (
+            for index, sentence in self.aligned_sentences_df.iterrows():
+                if sentence["url"] == row["url"]:
+                    self.aligned_sentences_df.at[index, "entities"] = (
                         entities_with_colors  # must use at
                     )
 
@@ -353,10 +352,9 @@ class NewsVerification:
 
     def create_fact_checker_table(self):
         rows = []
-        max_length = 30  # TODO: put this in configuration
-        rows.append(self.format_image_fact_checker_row(max_length))
+        rows.append(self.format_image_fact_checker_row())
 
-        for _, row in self.aligned_paragraphs_df.iterrows():
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue
 
@@ -404,6 +402,8 @@ class NewsVerification:
             if span_row == 1:
                 last_url_row = True
 
+            # end_of_paragraph = is_newline_after_text(row[0]["input"], self.news_content)
+
             formatted_row = self.format_text_fact_checker_row(
                 row,
                 first_url_row,
@@ -555,7 +555,7 @@ class NewsVerification:
         </tr>
         """
 
-    def format_image_fact_checker_row(self, max_length=30):
+    def format_image_fact_checker_row(self):
 
         if (
             self.image_referent_url is not None
@@ -577,9 +577,8 @@ class NewsVerification:
 
     def create_ordinary_user_table(self):
         rows = []
-        max_length = 30  # TODO: put this in configuration
-        rows.append(self.format_image_ordinary_user_row(max_length))
-        rows.append(self.format_text_ordinary_user_row(max_length))
+        rows.append(self.format_image_ordinary_user_row())
+        rows.append(self.format_text_ordinary_user_row())
         table = "\n".join(rows)
 
         return f"""
@@ -607,7 +606,7 @@ class NewsVerification:
         input_sentences = ""
         source_text_urls = ""
         urls = []
-        for _, row in self.aligned_paragraphs_df.iterrows():
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue
             input_sentences += row["input"] + "<br><br>"
@@ -641,16 +640,14 @@ class NewsVerification:
 
     def create_governor_table(self):
         rows = []
-        max_length = 30  # TODO: put this in configuration
-        rows.append(self.format_image_governor_row(max_length))
+        rows.append(self.format_image_governor_row())
 
-        for _, row in self.aligned_paragraphs_df.iterrows():
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue
 
             if row["source"] is None:
                 equal_idx_1 = equal_idx_2 = []
-
             else:
                 # Get index of equal phrases in input and source sentences
                 equal_idx_1, equal_idx_2 = extract_equal_text(
@@ -667,7 +664,7 @@ class NewsVerification:
                 ],
             )
 
-            formatted_row = self.format_text_governor_row(max_length)
+            formatted_row = self.format_text_governor_row()
             rows.append(formatted_row)
 
         table = "\n".join(rows)
@@ -694,7 +691,7 @@ class NewsVerification:
         <style>
         """
 
-    def format_text_governor_row(self, max_length=30):
+    def format_text_governor_row(self):
         input_sentences = ""
         source_sentences = ""
         source_text_urls = ""
@@ -705,9 +702,7 @@ class NewsVerification:
             if row[0]["input"] is None:
                 continue
 
-            if (
-                row[0]["source"] is not None and row[3] is not None
-            ):  # source is not empty
+            if row[0]["source"] is not None:  # source is not empty
                 # highlight entities
                 input_sentence, highlight_idx_input = apply_highlight(
                     row[0]["input"],
@@ -779,7 +774,7 @@ class NewsVerification:
         </tr>
         """
 
-    def format_image_governor_row(self, max_length=30):
+    def format_image_governor_row(self):
         if (
             self.image_referent_url is not None
             or self.image_referent_url != ""
@@ -803,7 +798,7 @@ class NewsVerification:
         return entity_count_text
 
     def color_text(self, text, colored_idx, highlighted_idx):
-        paragraph = ""
+        sentence = ""
         words = text.split()
 
         starts, ends = self.extract_starts_ends(colored_idx)
@@ -811,16 +806,16 @@ class NewsVerification:
 
         previous_end = 0
         for start, end in zip(starts, ends):
-            paragraph += " ".join(words[previous_end:start])
+            sentence += " ".join(words[previous_end:start])
 
             equal_words = " ".join(words[start:end])
-            paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
+            sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "
 
             previous_end = end
 
-        paragraph += " ".join(words[previous_end:])
+        sentence += " ".join(words[previous_end:])
 
-        return paragraph
+        return sentence
 
     def extract_starts_ends(self, colored_idx):
         starts = []
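Note: every rename in this file points at the same pattern: per-sentence rows in `aligned_sentences_df` are grouped by source `url` with a custom string aggregator before each URL is verified. A minimal, self-contained sketch of that pattern (column names follow the diff; `concat_text` is written out as the astype-and-join helper the diff's comment describes):

import pandas as pd

def concat_text(series: pd.Series) -> str:
    # Cast to str first to handle mixed data types and NaNs
    return " ".join(series.astype(str).tolist())

aligned_sentences_df = pd.DataFrame(
    {
        "input": ["claim one", "claim two", "claim three"],
        "source": ["match one", "match two", "match three"],
        "url": ["https://a.example", "https://a.example", "https://b.example"],
    }
)

grouped_url_df = aligned_sentences_df.groupby("url").agg(
    {"input": concat_text, "source": concat_text},
)
grouped_url_df["label"] = None
grouped_url_df["score"] = None
print(grouped_url_df)  # one aggregated row per source URL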
src/application/text/entity.py CHANGED
@@ -362,14 +362,14 @@ set to take office on Monday, could potentially reduce aid.
 """
 if __name__ == "__main__":
     with gr.Blocks() as demo:
-        gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
+        gr.Markdown("### Highlight Matching Parts Between Two Texts")
         text1_input = gr.Textbox(
-            label="Paragraph 1",
+            label="Text 1",
             lines=5,
             value=original_text,
         )
         text2_input = gr.Textbox(
-            label="Paragraph 2",
+            label="Text 2",
             lines=5,
             value=compared_text,
         )
src/application/text/helper.py CHANGED
@@ -61,7 +61,7 @@ def get_keywords(text, num_keywords=5):
 
 
 def get_important_sentences(
-    paragraph: str,
+    sentence: str,
     keywords: list[str],
     num_sentences: int = 3,
 ) -> list[str]:
@@ -69,16 +69,16 @@ def get_important_sentences(
     Selects important sentences based on a list of keywords.
 
     Args:
-        paragraph (str): The input paragraph.
+        sentence (str): The input sentence.
         keywords (list[str]): List of important keywords.
         num_sentences (int): Number of sentences to return (default is 3).
 
     Returns:
         list: A list of important sentences.
     """
-    # Clean and split the paragraph into sentences
+    # Clean and split the input into sentences
     sentences = [
-        s.strip() for s in re.split(r"(?<=[.!?])\s+", paragraph) if s.strip()
+        s.strip() for s in re.split(r"(?<=[.!?])\s+", sentence) if s.strip()
     ]
 
     # Calculate the importance score for each sentence
@@ -103,7 +103,7 @@
 
 
 def extract_important_phrases(
-    paragraph: str,
+    text: str,
     keywords: list[str],
     phrase_length: int = 5,
 ) -> list[str]:
@@ -112,20 +112,20 @@ def extract_important_phrases(
     Phrase length is auto-determined, and overlapped parts are less than 20%.
 
     Args:
-        paragraph (str): The input paragraph.
+        text (str): The input text.
         keywords (list[str]): List of important keywords.
         phrase_length (int): Length of phrases to extract (default: 5 words).
 
     Returns:
         list: A list of important phrases.
     """
-    # Tokenize the paragraph into words
-    words = word_tokenize(paragraph.lower())
+    # Tokenize the text into words
+    words = word_tokenize(text.lower())
 
     # Determine phrase length (between 3 and 7 words)
     phrase_length = min(max(len(words) // 10, 5), 7)
 
-    # Generate n-grams (phrases) from the paragraph
+    # Generate n-grams (phrases) from the text
     phrases = list(ngrams(words, phrase_length))
 
     important_phrases = []
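Note: the parameter renames here do not change behavior; `extract_important_phrases` still tokenizes the text and slides a fixed-length n-gram window over it. A quick illustration of that step (assumes nltk with the `punkt` tokenizer data installed):

from nltk import word_tokenize
from nltk.util import ngrams

text = "Deep learning is advancing rapidly, and modern networks improve AI research."
words = word_tokenize(text.lower())
phrase_length = min(max(len(words) // 10, 5), 7)  # clamped to 5..7 by the formula above
phrases = list(ngrams(words, phrase_length))
print(phrases[0])  # ('deep', 'learning', 'is', 'advancing', 'rapidly')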
src/application/text/highlight_text.py CHANGED
@@ -57,7 +57,7 @@ def generate_color(index, total_colors=20):
 
 
 def highlight_pairs(text1, text2):
-    """Highlight matching pairs between two paragraphs"""
+    """Highlight matching pairs between two texts"""
     # Predefined matching pairs
     match_pairs = [
         {
@@ -145,7 +145,7 @@ def highlight_pairs(text1, text2):
         highlighted_text += text[prev_end:]
         return highlighted_text
 
-    # Apply highlighting to both paragraphs using the global MATCH_PAIRS
+    # Apply highlighting to both texts using the global MATCH_PAIRS
     highlighted_text1 = apply_highlight(
         text1,
         match_pairs,
@@ -171,9 +171,9 @@ if __name__ == "__main__":
     text1 = ""
 
     with gr.Blocks() as demo:
-        gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
+        gr.Markdown("### Highlight Matching Parts Between Two texts")
         text1_input = gr.Textbox(
-            label="Paragraph 1",
+            label="Text 1",
             lines=5,
             value="""
The field of deep learning is advancing rapidly.
@@ -181,7 +181,7 @@ Modern neural networks are improving AI research significantly.
     """,
         )
         text2_input = gr.Textbox(
-            label="Paragraph 2",
+            label="Text 2",
             lines=5,
             value="""
Advances in deep learning have led to breakthroughs in AI research.
src/application/text/preprocessing.py CHANGED
@@ -1,7 +1,7 @@
 from nltk.tokenize import sent_tokenize
 
 
-def split_into_paragraphs(input_text):
+def split_into_sentences(input_text):
     """
     Splits input text into sentences by newlines.
 
@@ -21,3 +21,26 @@ def split_into_paragraphs(input_text):
         if paragraph and paragraph != "\n":
             sentences.extend(sent_tokenize(paragraph))
     return sentences
+
+
+def split_into_paragraphs(input_text):
+    """
+    Splits input text into paragraphs by newlines.
+
+    Args:
+        input_text: The input text as a string.
+
+    Returns:
+        A list of paragraphs. Returns an empty list if input is not valid.
+    """
+    if not isinstance(input_text, str):
+        return []
+
+    paragraphs = input_text.splitlines(keepends=True)
+    out_paragraphs = []
+    for paragraph in paragraphs:
+        paragraph = paragraph.strip()
+        if paragraph and paragraph != "\n":
+            out_paragraphs.append(paragraph)
+    print(f"paragraphs: {out_paragraphs}")
+    return out_paragraphs
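Note: after this change the module exposes two splitters at different granularity: the renamed `split_into_sentences` still runs `sent_tokenize` over each non-empty line, while the new `split_into_paragraphs` keeps each non-empty line whole. Expected behavior, assuming nltk's `punkt` sentence tokenizer:

from src.application.text.preprocessing import split_into_paragraphs, split_into_sentences

text = "First sentence. Second sentence.\n\nA new paragraph."

print(split_into_sentences(text))
# -> ['First sentence.', 'Second sentence.', 'A new paragraph.']

print(split_into_paragraphs(text))
# -> ['First sentence. Second sentence.', 'A new paragraph.']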
src/application/text/search_detection.py CHANGED
@@ -9,7 +9,7 @@ from sentence_transformers import (
     util,
 )
 
-from src.application.text.preprocessing import split_into_paragraphs
+from src.application.text.preprocessing import split_into_sentences
 from src.application.text.search import (
     generate_search_phrases,
     search_by_google,
@@ -38,7 +38,7 @@ MIN_RATIO_PARAPHRASE_NUM = 0.5
 MAX_CHAR_SIZE = 30000
 
 
-def find_paragraph_source(text, text_index, sentences_df):
+def find_sentence_source(text, text_index, sentences_df):
 
     checked_urls = set()
     searched_phrases = generate_search_phrases(text[text_index])
@@ -63,14 +63,14 @@ def find_paragraph_source(text, text_index, sentences_df):
             print("\t\t\t↑↑↑ Title or text not found")
             continue
 
-        page_text = content.title + "\n" + content.text
-        if len(page_text) > MAX_CHAR_SIZE:
+        source_text = content.title + "\n" + content.text
+        if len(source_text) > MAX_CHAR_SIZE:
             print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
             continue
         print(f"\t\t\t↑↑↑ Title: {content.title}")
         aligned_sentence = check_paraphrase(
             text[text_index],
-            page_text,
+            source_text,
             url,
         )
@@ -105,7 +105,7 @@ def find_paragraph_source(text, text_index, sentences_df):
         # find matched content in new url
         aligned_sentence = check_paraphrase(
             text[idx],
-            page_text,
+            source_text,
             url,
         )
@@ -222,7 +222,7 @@ def check_sentence(
     return False
 
 
-def check_paraphrase(input_text, page_text, url):
+def check_paraphrase(input_text, source_text, url):
     """
     Checks if the input text is paraphrased in the content at the given URL.
 
@@ -237,30 +237,30 @@ def check_paraphrase(input_text, page_text, url):
     """
 
     # Extract sentences from input text and web page
-    input_paragraphs = [input_text]
+    input_sentences = split_into_sentences(input_text)
 
-    if not page_text:
+    if not source_text:
         return {}
 
-    page_paragraphs = split_into_paragraphs(page_text)
-    if not input_paragraphs or not page_paragraphs:
+    source_sentences = split_into_sentences(source_text)
+    if not input_sentences or not source_sentences:
         return {}
 
     additional_sentences = []
-    for sentence in page_paragraphs:
+    for sentence in source_sentences:
         if ", external" in sentence:
             additional_sentences.append(sentence.replace(", external", ""))
-    page_paragraphs.extend(additional_sentences)
+    source_sentences.extend(additional_sentences)
 
     # Encode sentences into embeddings
     embeddings1 = PARAPHASE_MODEL.encode(
-        input_paragraphs,
+        input_sentences,
         convert_to_tensor=True,
         device=DEVICE,
         show_progress_bar=False,
     )
     embeddings2 = PARAPHASE_MODEL.encode(
-        page_paragraphs,
+        source_sentences,
         convert_to_tensor=True,
         device=DEVICE,
         show_progress_bar=False,
@@ -270,23 +270,31 @@ def check_paraphrase(input_text, source_text, url):
     similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
 
     # Find sentence alignments
-    alignment = {}
-    for i, paragraph in enumerate(input_paragraphs):
+    inputs = ""
+    sources = ""
+    similarities = []
+
+    for i, sentence in enumerate(input_sentences):
         max_sim_index = np.argmax(similarity_matrix[i])
         max_similarity = similarity_matrix[i][max_sim_index]
-
-        label, is_paraphrased = determine_label(max_similarity)
-        best_matched_paragraph = page_paragraphs[max_sim_index]
-
-        alignment = {
-            "input": paragraph,
-            "source": best_matched_paragraph,
-            "similarity": max_similarity,
+        best_matched_sentence = source_sentences[max_sim_index]
+
+        inputs += sentence + " "
+        sources += best_matched_sentence + " "
+        similarities.append(max_similarity)
+
+    similarity = sum(similarities) / len(similarities)
+    label, is_paraphrased = determine_label(max_similarity)
+    alignment = {
+        "input": inputs,
+        "source": sources,
+        "similarity": similarity,
         "label": label,
         "paraphrase": is_paraphrased,
        "url": url,
     }
-        print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')
+    print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')
 
     return alignment
 
test.py CHANGED
@@ -1,14 +1,74 @@
-import pandas as pd
+import re
 
-# Assuming your CSV file is named 'data.csv'
-try:
-    df = pd.read_csv('data/bbc_news_4o_mini.csv')
-    # df = pd.read_csv('data/MAGE_4o_mini.csv')
-
-    print(df.columns)  # header names
-    print(len(df))
-
-except FileNotFoundError:
-    print("Error: data.csv not found")
-except Exception as e:
-    print(f"An error occurred: {e}")
+def is_newline_after_text(text1, text2):
+    """
+    Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
+
+    Args:
+        text1: The text to search for.
+        text2: The text to search within.
+
+    Returns:
+        A tuple: (True/False if text1 is found, True/False if next char is newline, or None if not found)
+    """
+
+    match = re.search(re.escape(text1), text2)  # escape text1 to handle special characters
+
+    if match:
+        # Find the next non-space character
+        next_char_index = match.end()
+        while next_char_index < len(text2) and text2[next_char_index].isspace():
+            next_char_index += 1
+
+        if text2[next_char_index:next_char_index + 2] == r'\n':
+            print("newline found")
+        if next_char_index < len(text2) and text2[next_char_index:next_char_index + 2] == r'\n':
+            return True
+
+    return False
+
+def is_newline_after_text_2(text1, text2):
+    """
+    Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
+
+    Args:
+        text1: The text to search for.
+        text2: The text to search within.
+
+    Returns:
+        True if next char is newline
+    """
+    text2 = text2.replace("\n", "\\n")
+
+    ater_text = text2.split(text1)
+    if len(ater_text) > 1:
+        ater_text = ater_text[1].lstrip()  # Remove spaces
+        if ater_text.startswith('\n'):
+            return True
+    return False
+
+# Example usage:
+text1 = "hello"
+text2 = "some text hello \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello more text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello\t\nmore text"  # test tab space before newline
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello."  # test special characters
+text2 = "some text hello. \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")