pmkhanh7890 committed · Commit bfe6692 · 1 Parent(s): 62dc9d8

pre-commit

gpt_test.py CHANGED
@@ -1,28 +1,35 @@
+import csv
 import os
 
 from dotenv import load_dotenv
-from openai import AzureOpenAI, OpenAIError
+from openai import (
+    AzureOpenAI,
+    OpenAIError,
+)
 
 
-import csv
-
 def get_first_column(csv_filepath):
     """
-    Reads a CSV file with a header and returns a list containing only the
+    Reads a CSV file with a header and returns a list containing only the
     values from the first column.
 
     Args:
        csv_filepath: The path to the CSV file.
 
     Returns:
-        A list of strings, where each string is a value from the first
-        column of the CSV file. Returns an empty list if there's an error
-        opening or reading the file, or if the file has no rows after the header.
+        A list of strings, where each string is a value from the first
+        column of the CSV file.
+        Returns an empty list if there's an error opening or reading
+        the file, or if the file has no rows after the header.
    Prints an error message to the console in case of file errors.
    """
    first_column_values = []
    try:
-        with open(csv_filepath, 'r', newline='', encoding='utf-8') as csvfile:  # Handle potential encoding issues
+        with open(
+            csv_filepath,
+            newline="",
+            encoding="utf-8",
+        ) as csvfile:  # Handle potential encoding issues
            reader = csv.reader(csvfile)
            next(reader, None)  # Skip the header row (if it exists)
 
@@ -32,21 +39,29 @@ def get_first_column(csv_filepath):
 
    except FileNotFoundError:
        print(f"Error: File not found at {csv_filepath}")
-    except Exception as e:  # Catch other potential errors (e.g., UnicodeDecodeError)
+    except (
+        Exception
+    ) as e:  # Catch other potential errors (e.g., UnicodeDecodeError)
        print(f"An error occurred: {e}")
-
+
    return first_column_values
 
+
 def add_text_to_csv(csv_filepath, text_to_add, index=0):
    """
    Adds text to a single-column CSV file (UTF-8 encoding).
 
    Args:
        csv_filepath: The path to the CSV file.
-        text_to_add: The text to append to the CSV file (one value per new row).
+        text_to_add: The text to append to CSV file (one value per new row).
    """
    try:
-        with open(csv_filepath, 'a', newline='', encoding='utf-8') as csvfile:  # 'a' for append mode
+        with open(
+            csv_filepath,
+            "a",
+            newline="",
+            encoding="utf-8",
+        ) as csvfile:  # 'a' for append mode
            writer = csv.writer(csvfile)
 
            # Check if file is empty to determine if header needs to be written
@@ -58,13 +73,18 @@ def add_text_to_csv(csv_filepath, text_to_add, index=0):
 
            if isinstance(text_to_add, list):  # Check if text_to_add is a list
                for text_item in text_to_add:
-                    writer.writerow([index, text_item])  # Write text_item as a single-element row
+                    writer.writerow(
+                        [index, text_item],
+                    )  # Write text_item as a single-element row
            else:  # If not a list, assume it's a single string
-                writer.writerow([index, text_to_add])  # Write text_to_add as a single-element row
+                writer.writerow(
+                    [index, text_to_add],
+                )  # Write text_to_add as a single-element row
 
    except Exception as e:
        print(f"An error occurred: {e}")
 
+
 load_dotenv()
 AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
 AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
@@ -76,25 +96,25 @@ azure_client = AzureOpenAI(
    api_version="2024-05-01-preview",
 )
 
-deplopment_name = "gpt-4o-mini"  # "o1-mini" # or "gpt-4o"
+deplopment_name = "gpt-4o-mini"  # "o1-mini" # or "gpt-4o"
 TEXT_PROMPT = """
 Paraphrase the following news, only output the paraphrased text:
 
 """
 text = get_first_column("data/MAGE.csv")
-count = 0
+count = 0
 for index, news in enumerate(text):
    if count > 1000:
        break
    prompt = TEXT_PROMPT + news
    print(f"{index:5}:\t{news[:50]}")
-    #print(f"{index:5}:\t{prompt}")
-
+    # print(f"{index:5}:\t{prompt}")
+
    try:
        response = azure_client.chat.completions.create(
            model=deplopment_name,  # model = "deployment_name".
            messages=[
-                # {"role": "system", "content": "You are a helpful assistant."},
+                # {"role": "system", "content": "You're an assistant."},
                {"role": "user", "content": prompt},
            ],
            # max_tokens=512,
@@ -103,8 +123,8 @@ for index, news in enumerate(text):
    except OpenAIError as e:
        print(f"Error interacting with OpenAI API: {e}")
        continue
-
+
    count += 1
    paraphrased_news = response.choices[0].message.content
-
+
    add_text_to_csv("data/MAGE_4o_mini.csv", paraphrased_news, count)
src/application/content_detection.py CHANGED
@@ -1,6 +1,5 @@
 from difflib import SequenceMatcher
 
-import numpy as np
 import pandas as pd
 
 from src.application.image.image_detection import (
@@ -13,7 +12,10 @@ from src.application.text.entity import (
    highlight_entities,
 )
 from src.application.text.helper import extract_equal_text
-from src.application.text.model_detection import detect_text_by_ai_model, predict_generation_model
+from src.application.text.model_detection import (
+    detect_text_by_ai_model,
+    predict_generation_model,
+)
 from src.application.text.preprocessing import split_into_paragraphs
 from src.application.text.search_detection import (
    PARAPHRASE_THRESHOLD_MACHINE,
@@ -30,17 +32,17 @@ class NewsVerification:
 
        self.text_prediction_label: list[str] = ["UNKNOWN"]
        self.text_prediction_score: list[float] = [0.0]
-
+
        self.image_prediction_label: list[str] = ["UNKNOWN"]
        self.image_prediction_score: list[str] = [0.0]
        self.image_referent_url: list[str] = []
-
+
        self.news_prediction_label = ""
        self.news_prediction_score = -1
 
        # news' urls to find img
        self.found_img_url: list[str] = []
-
+
        # Analyzed results
        self.aligned_paragraphs_df: pd.DataFrame = pd.DataFrame(
            columns=[
@@ -69,24 +71,26 @@ class NewsVerification:
 
    def determine_text_origin(self):
        self.find_text_source()
-
+
        # Group inout and source by url
        def concat_text(series):
-            return ' '.join(series.astype(str).tolist()) #Handle mixed data types and NaNs
-
-        self.grouped_url_df = self.aligned_paragraphs_df.groupby('url').agg(
+            return " ".join(
+                series.astype(str).tolist(),
+            )  # Handle mixed data types and NaNs
+
+        self.grouped_url_df = self.aligned_paragraphs_df.groupby("url").agg(
            {
-                'input': concat_text,
-                'source': concat_text,
-            }
-        )
+                "input": concat_text,
+                "source": concat_text,
+            },
+        )
        self.grouped_url_df = self.grouped_url_df.reset_index()
        # Add new columns for label and score
        self.grouped_url_df["label"] = None
        self.grouped_url_df["score"] = None
-
+
        print(f"aligned_paragraphs_df:\n {self.aligned_paragraphs_df}")
-
+
        for index, row in self.grouped_url_df.iterrows():
            label, score = self.verify_text(row["url"])
            if label == "UNKNOWN":
@@ -95,18 +99,21 @@ class NewsVerification:
 
                # detect by baseline model
                label, score = detect_text_by_ai_model(text)
-
+
            self.grouped_url_df.at[index, "label"] = label
            self.grouped_url_df.at[index, "score"] = score
 
        # Overall label or score for the whole input text
        if len(self.grouped_url_df) > 0:
-            # filter self.aligned_paragraphs_df["label"] if inclucind substring MACHINE
            machine_label = self.grouped_url_df[
-                self.grouped_url_df["label"].str.contains("MACHINE", case=False, na=False)
+                self.grouped_url_df["label"].str.contains(
+                    "MACHINE",
+                    case=False,
+                    na=False,
+                )
            ]
            # machine_label = self.aligned_paragraphs_df[
-            # self.aligned_paragraphs_df["label"] == "MACHINE"
+            #     self.aligned_paragraphs_df["label"] == "MACHINE"
            # ]
            if len(machine_label) > 0:
                label = " ".join(machine_label["label"].tolist())
@@ -118,14 +125,14 @@ class NewsVerification:
            ]
            self.text_prediction_label[0] = "HUMAN"
            self.text_prediction_score[0] = machine_label["score"].mean()
-        else:  # no source found in the input text
+        else:  # no source found in the input text
            print("No source found in the input text")
            text = " ".join(self.aligned_paragraphs_df["input"].tolist())
            # detect by baseline model
-            label, score = detect_text_by_ai_model(text)
+            label, score = detect_text_by_ai_model(text)
            self.text_prediction_label[0] = label
            self.text_prediction_score[0] = score
-
+
    def find_text_source(self):
        """
        Determines the origin of the given text based on paraphrasing detection
@@ -148,15 +155,22 @@ class NewsVerification:
 
        for _ in range(len(input_sentences)):
            self.aligned_paragraphs_df = pd.concat(
-                [self.aligned_paragraphs_df, pd.DataFrame([{
-                    "input": None,
-                    "source": None,
-                    "label": None,
-                    "similarity": None,
-                    "paraphrase": None,
-                    "url": None,
-                    "entities": None,
-                }])],
+                [
+                    self.aligned_paragraphs_df,
+                    pd.DataFrame(
+                        [
+                            {
+                                "input": None,
+                                "source": None,
+                                "label": None,
+                                "similarity": None,
+                                "paraphrase": None,
+                                "url": None,
+                                "entities": None,
+                            },
+                        ],
+                    ),
+                ],
                ignore_index=True,
            )
 
@@ -183,7 +197,8 @@ class NewsVerification:
    def verify_text(self, url):
        label = "UNKNOWN"
        score = 0
-        # calculate the average similarity when the similary score in each row of sentences_df is higher than 0.8
+        # calculate the average similarity when the similary score
+        # in each row of sentences_df is higher than 0.8
        filtered_by_url = self.aligned_paragraphs_df[
            self.aligned_paragraphs_df["url"] == url
        ]
@@ -192,17 +207,24 @@ class NewsVerification:
        ]
        if len(filtered_by_similarity) / len(self.aligned_paragraphs_df) > 0.5:
            # check if "MACHINE" is in self.aligned_sentences_df["label"]:
-            contains_machine = filtered_by_similarity["label"].str.contains(
-                "MACHINE", case=False, na=False
-            ).any()
+            contains_machine = (
+                filtered_by_similarity["label"]
+                .str.contains(
+                    "MACHINE",
+                    case=False,
+                    na=False,
+                )
+                .any()
+            )
            if contains_machine:
                label = "MACHINE"
                machine_rows = filtered_by_similarity[
                    filtered_by_similarity["label"].str.contains(
                        "MACHINE",
                        case=False,
-                        na=False)
-                ]
+                        na=False,
+                    )
+                ]
                generated_model, _ = predict_generation_model(self.news_text)
                label += f"<br>({generated_model})"
                score = machine_rows["similarity"].mean()
@@ -212,12 +234,12 @@ class NewsVerification:
                    filtered_by_similarity["label"].str.contains(
                        "HUMAN",
                        case=False,
-                        na=False)
-                ]
+                        na=False,
+                    )
+                ]
                score = human_rows["similarity"].mean()
-
+
        return label, score
-
 
    def determine_image_origin(self):
        print("CHECK IMAGE:")
@@ -267,14 +289,14 @@ class NewsVerification:
        self.determine_image_origin()
 
    def analyze_details(self):
-        self.handle_entities()
+        self.handle_entities()
        ordinary_user_table = self.create_ordinary_user_table()
        fact_checker_table = self.create_fact_checker_table()
        governor_table = self.create_governor_table()
 
        return ordinary_user_table, fact_checker_table, governor_table
-
-    def handle_entities(self):
+
+    def handle_entities(self):
        entities_with_colors = []
        for index, row in self.grouped_url_df.iterrows():
            # Get entity-words (in pair) with colors
@@ -283,12 +305,11 @@ class NewsVerification:
                row["source"],
            )
 
-            #self.grouped_url_df.at[index, "entities"] = entities_with_colors # must use at
-
            for index, paragraph in self.aligned_paragraphs_df.iterrows():
                if paragraph["url"] == row["url"]:
-                    self.aligned_paragraphs_df.at[index, "entities"] = entities_with_colors # must use at
-
+                    self.aligned_paragraphs_df.at[index, "entities"] = (
+                        entities_with_colors  # must use at
+                    )
 
    def get_text_urls(self):
        return set(self.text_referent_url)
@@ -336,13 +357,13 @@ class NewsVerification:
        rows.append(self.format_image_fact_checker_row(max_length))
 
        for _, row in self.aligned_paragraphs_df.iterrows():
-            if row["input"] == None:
+            if row["input"] is None:
                continue
-
-            if row["source"] == None:
+
+            if row["source"] is None:
                equal_idx_1 = equal_idx_2 = []
-
-            else:  # Get index of equal phrases in input and source sentences
+
+            else:  # Get index of equal phrases in input and source sentences
                equal_idx_1, equal_idx_2 = extract_equal_text(
                    row["input"],
                    row["source"],
@@ -354,33 +375,42 @@ class NewsVerification:
                    equal_idx_1,
                    equal_idx_2,
                    row["entities"],
-                    row["url"]
+                    row["url"],
                ],
            )
-
+
        previous_url = None
        span_row = 1
-        for index, row in enumerate(self.fact_checker_table):
+        for index, row in enumerate(self.fact_checker_table):
            current_url = row[4]
            last_url_row = False
-
+
            # First row or URL change
            if index == 0 or current_url != previous_url:
                first_url_row = True
                previous_url = current_url
                # Increase counter "span_row" when the next url is the same
-                while index + span_row < len(self.fact_checker_table) \
-                    and self.fact_checker_table[index + span_row][4] == current_url:
+                while (
+                    index + span_row < len(self.fact_checker_table)
+                    and self.fact_checker_table[index + span_row][4]
+                    == current_url
+                ):
                    span_row += 1
-
+
            else:
                first_url_row = False
                span_row -= 1
-
+
            if span_row == 1:
                last_url_row = True
-
-            formatted_row = self.format_text_fact_checker_row(row, first_url_row, last_url_row, span_row, max_length)
+
+            formatted_row = self.format_text_fact_checker_row(
+                row,
+                first_url_row,
+                last_url_row,
+                span_row,
+                max_length,
+            )
            rows.append(formatted_row)
 
        table = "\n".join(rows)
@@ -436,7 +466,7 @@ class NewsVerification:
        source_sentence = row[0]["source"]
        highlight_idx_input = []
        highlight_idx_source = []
-
+
        if row[3] is not None:
            entity_count = len(row[3])
 
@@ -453,7 +483,7 @@ class NewsVerification:
            )  # text, index of highlight words
 
            # Replace _ to get correct formatting
-            # Original one having _ for correct word counting
+            # Original one having _ for correct word counting
            input_sentence = input_sentence.replace(
                "span_style",
                "span style",
@@ -468,24 +498,22 @@ class NewsVerification:
 
        url = row[0]["url"]
        # Displayed label and score by url
-        filterby_url = self.grouped_url_df[
-            self.grouped_url_df["url"] == url
-        ]
+        filterby_url = self.grouped_url_df[self.grouped_url_df["url"] == url]
        if len(filterby_url) > 0:
            label = filterby_url["label"].values[0]
            score = filterby_url["score"].values[0]
-        else:
+        else:
            label = self.text_prediction_label[0]
            score = self.text_prediction_score[0]
 
        # Format displayed url
-
+
        short_url = self.shorten_url(url, max_length)
        source_text_url = f"""<a href="{url}">{short_url}</a>"""
 
        # Format displayed entity count
        entity_count_text = self.get_entity_count_text(entity_count)
-
+
        border_top = "border-top: 1px solid transparent;"
        border_bottom = "border-bottom: 1px solid transparent;"
        if first_url_row is True:
@@ -580,7 +608,7 @@ class NewsVerification:
        source_text_urls = ""
        urls = []
        for _, row in self.aligned_paragraphs_df.iterrows():
-            if row["input"] == None:
+            if row["input"] is None:
                continue
            input_sentences += row["input"] + "<br><br>"
            url = row["url"]
@@ -620,13 +648,13 @@ class NewsVerification:
        rows.append(self.format_image_governor_row(max_length))
 
        for _, row in self.aligned_paragraphs_df.iterrows():
-            if row["input"] == None:
+            if row["input"] is None:
                continue
-
-            if row["source"] == None:
+
+            if row["source"] is None:
                equal_idx_1 = equal_idx_2 = []
-
-            else:
+
+            else:
                # Get index of equal phrases in input and source sentences
                equal_idx_1, equal_idx_2 = extract_equal_text(
                    row["input"],
@@ -680,19 +708,25 @@ class NewsVerification:
            if row[0]["input"] is None:
                continue
 
-            if row[0]["source"] is not None and row[3] is not None:  # source is not empty
+            if (
+                row[0]["source"] is not None and row[3] is not None
+            ):  # source is not empty
                # highlight entities
                input_sentence, highlight_idx_input = apply_highlight(
                    row[0]["input"],
                    row[3],  # entities_with_colors
                    "input",  # key
-                    entity_count[-2],  # since the last one is for current counting
+                    entity_count[
+                        -2
+                    ],  # since the last one is for current counting
                )
                source_sentence, highlight_idx_source = apply_highlight(
                    row[0]["source"],
                    row[3],  # entities_with_colors
                    "source",  # key
-                    entity_count[-2],  # since the last one is for current counting
+                    entity_count[
+                        -2
+                    ],  # since the last one is for current counting
                )
 
                # Color overlapping words
@@ -722,12 +756,11 @@ class NewsVerification:
            else:
                source_sentence = row[0]["source"]
                input_sentence = row[0]["input"]
-
 
            # convert score to HUMAN-based score:
            input_sentences += input_sentence + "<br><br>"
            source_sentences += source_sentence + "<br><br>"
-
+
            url = row[0]["url"]
            if url not in urls:
                urls.append(url)
@@ -736,7 +769,7 @@ class NewsVerification:
                sentence_count += 1
            if row[3] is not None:
                entity_count.append(len(row[3]))
-
+
        entity_count_text = self.get_entity_count_text(sum(entity_count))
 
        return f"""
@@ -791,7 +824,7 @@ class NewsVerification:
 
        starts, ends = self.extract_starts_ends(colored_idx)
        starts, ends = self.filter_indices(starts, ends, highlighted_idx)
-
+
        previous_end = 0
        for start, end in zip(starts, ends):
            paragraph += " ".join(words[previous_end:start])
@@ -892,4 +925,4 @@ class NewsVerification:
            starts.append(start)
            ends.append(end)
 
-        return starts, ends
+        return starts, ends
src/application/text/entity.py CHANGED
@@ -161,7 +161,7 @@ def assign_colors_to_entities(entities):
 
 
 def highlight_entities(text1, text2):
-    if text1 == None or text2 == None:
+    if text1 is None or text2 is None:
        return None
 
    entities_text = extract_entities_gpt(text1, text2)
src/application/text/helper.py CHANGED
@@ -147,7 +147,7 @@ def extract_equal_text(text1, text2):
        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        return text
-
+
    splited_text1 = cleanup(text1).split()
    splited_text2 = cleanup(text2).split()
 
@@ -163,7 +163,8 @@ def extract_equal_text(text1, text2):
            equal_idx_2.append({"start": j1, "end": j2})
            # subtext_1 = " ".join(text1[i1:i2])
            # subtext_2 = " ".join(text2[j1:j2])
-            # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
+            # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j1:2}]
+            # {subtext_1!r:>55} --> {subtext_2!r}')
    return equal_idx_1, equal_idx_2
 
 
src/application/text/model_detection.py CHANGED
@@ -1,11 +1,16 @@
-from transformers import pipeline
 import os
 
-from dotenv import load_dotenv
-from openai import AzureOpenAI, OpenAIError
-from sentence_transformers import SentenceTransformer, util
 import torch
-
+from dotenv import load_dotenv
+from openai import (
+    AzureOpenAI,
+    OpenAIError,
+)
+from sentence_transformers import (
+    SentenceTransformer,
+    util,
+)
+from transformers import pipeline
 
 load_dotenv()
 AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
@@ -74,7 +79,7 @@ def detect_text_by_ai_model(
    return UNKNOWN, 0.5  # Return UNKNOWN and 0.0 confidence if error
 
 
-def predict_generation_model(text:str) -> tuple[str, float]:
+def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Predicts if text is generated by gpt-4o or gpt-4o-mini models.
    Compare the input text against the paraphrased text by the models.
@@ -94,7 +99,7 @@ def predict_generation_model(text:str) -> tuple[str, float]:
        if similarity > best_similarity:
            best_similarity = similarity
            best_model = model
-
+
    return best_model, best_similarity
 
 
@@ -125,8 +130,9 @@ Paraphrase the following news, only output the paraphrased text:
        return paraphrased_text
    except OpenAIError as e:  # Add exception handling
        print(f"Error in AI model inference: {e}")
-        return None
-
+        return None
+
+
 def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measure the similarity between two texts.
@@ -151,4 +157,3 @@ def measure_text_similarity(text1: str, text2: str) -> float:
    similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    print(similarity[0][0])
    return similarity[0][0]
-
src/application/text/search_detection.py CHANGED
@@ -75,23 +75,26 @@ def find_paragraph_source(text, text_index, sentences_df):
        )
 
        if aligned_sentence["paraphrase"] is False:
-            sentences_df.loc[text_index, "input"] = aligned_sentence["input"]
-            sentences_df.loc[text_index, "paraphrase"] = aligned_sentence["paraphrase"]
+            sentences_df.loc[text_index, "input"] = aligned_sentence[
+                "input"
+            ]
+            sentences_df.loc[text_index, "paraphrase"] = (
+                aligned_sentence["paraphrase"]
+            )
            return sentences_df, []
-
+
        # assign values
        columns = [
            "input",
-            "source",
-            "label",
-            "similarity",
-            "paraphrase",
+            "source",
+            "label",
+            "similarity",
+            "paraphrase",
            "url",
-        ]
+        ]
        for c in columns:
            if c in sentences_df.columns:
                sentences_df.loc[text_index, c] = aligned_sentence[c]
-
 
        for idx, _ in sentences_df.iterrows():
            similarity = sentences_df.loc[idx, "similarity"]
@@ -106,12 +109,20 @@ def find_paragraph_source(text, text_index, sentences_df):
                url,
            )
 
-            if similarity is None or \
-                aligned_sentence["similarity"] > similarity:
-                columns = ["input", "source", "label", "similarity", "url"]
-                for c in columns:
-                    if c in sentences_df.columns:
-                        sentences_df.loc[idx, c] = aligned_sentence[c]
+            if (
+                similarity is None
+                or aligned_sentence["similarity"] > similarity
+            ):
+                columns = [
+                    "input",
+                    "source",
+                    "label",
+                    "similarity",
+                    "url",
+                ]
+                for c in columns:
+                    if c in sentences_df.columns:
+                        sentences_df.loc[idx, c] = aligned_sentence[c]
        return sentences_df, content.images
 
    sentences_df.loc[text_index, "input"] = text[text_index]
@@ -266,7 +277,7 @@ def check_paraphrase(input_text, page_text, url):
 
        label, is_paraphrased = determine_label(max_similarity)
        best_matched_paragraph = page_paragraphs[max_sim_index]
-
+
        alignment = {
            "input": paragraph,
            "source": best_matched_paragraph,
@@ -317,6 +328,7 @@ def check_human(alligned_sentences):
            return True
    return False
 
+
 def determine_label(similarity):
    if similarity >= PARAPHRASE_THRESHOLD_HUMAN:
        return "HUMAN", True
test.py CHANGED
@@ -1,2 +1,2 @@
 my_list = [0, 0]
-print(my_list[-2])
+print(my_list[-2])