pmkhanh7890 committed on
Commit
e58707f
·
1 Parent(s): b73a4fc
src/application/config.py CHANGED
@@ -88,4 +88,4 @@ ENTITY_BRIGHTNESS = 0.75 # color's brightness.
88
 
89
 
90
  # HTML formatting
91
- WORD_BREAK = "word-break: break-all;"
 
88
 
89
 
90
  # HTML formatting
91
+ WORD_BREAK = "word-break: break-all;"
src/application/content_detection.py CHANGED
@@ -5,19 +5,22 @@ Date: 2024-12-04
5
 
6
  import pandas as pd
7
 
8
- from src.application.config import MIN_RATIO_PARAPHRASE_NUM, PARAPHRASE_THRESHOLD, PARAPHRASE_THRESHOLD_MACHINE
9
- from src.application.formatting import color_text, format_entity_count
 
 
 
 
 
 
 
10
  from src.application.image.image_detection import (
11
  detect_image_by_ai_model,
12
  detect_image_by_reverse_search,
13
  detect_image_from_news_image,
14
  )
15
- from src.application.text.entity import (
16
- apply_highlight,
17
- highlight_entities,
18
- )
19
  from src.application.text.helper import (
20
- extract_equal_text,
21
  postprocess_label,
22
  split_into_paragraphs,
23
  )
@@ -26,6 +29,7 @@ from src.application.text.model_detection import (
26
  predict_generation_model,
27
  )
28
  from src.application.text.search_detection import find_sentence_source
 
29
 
30
 
31
  class NewsVerification:
@@ -38,12 +42,8 @@ class NewsVerification:
38
  self.news_content: str = ""
39
  self.news_image: str = ""
40
 
41
- self.text_prediction_label: list[str] = ["UNKNOWN"]
42
- self.text_prediction_score: list[float] = [0.0]
43
-
44
- self.image_prediction_label: list[str] = ["UNKNOWN"]
45
- self.image_prediction_score: list[str] = [0.0]
46
- self.image_referent_url: list[str] = []
47
 
48
  self.news_prediction_label: str = ""
49
  self.news_prediction_score: float = -1
@@ -63,12 +63,6 @@ class NewsVerification:
63
  # "entities",
64
  ],
65
  )
66
- self.grouped_url_df: pd.DataFrame = pd.DataFrame()
67
-
68
- # For formatting output tables
69
- self.ordinary_user_table: list = []
70
- self.fact_checker_table: list = []
71
- self.governor_table: list = []
72
 
73
  def load_news(self, news_title: str, news_content: str, news_image: str):
74
  """
@@ -111,7 +105,7 @@ class NewsVerification:
111
  ) # Handle mixed data types and NaNs
112
 
113
  # Group sentences by URL and concatenate 'input' and 'source' text.
114
- self.grouped_url_df = (
115
  self.aligned_sentences_df.groupby("url")
116
  .agg(
117
  {
@@ -123,8 +117,8 @@ class NewsVerification:
123
  ) # Reset index to make 'url' a regular column
124
 
125
  # Add new columns for label and score
126
- self.grouped_url_df["label"] = None
127
- self.grouped_url_df["score"] = None
128
 
129
  print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")
130
 
@@ -132,7 +126,7 @@ class NewsVerification:
132
  """
133
  Determines the text origin for each URL group.
134
  """
135
- for index, row in self.grouped_url_df.iterrows():
136
  # Verify text origin using URL-based verification.
137
  label, score = self.verify_text(row["url"])
138
 
@@ -144,8 +138,8 @@ class NewsVerification:
144
  # Detect text origin using an AI model.
145
  label, score = detect_text_by_ai_model(text)
146
 
147
- self.grouped_url_df.at[index, "label"] = label
148
- self.grouped_url_df.at[index, "score"] = score
149
 
150
  def determine_text_origin(self):
151
  """
@@ -166,10 +160,10 @@ class NewsVerification:
166
  self.determine_text_origin_by_url()
167
 
168
  # Determine the overall label and score for the entire input text.
169
- if not self.grouped_url_df.empty:
170
  # Check for 'gpt-4o' labels in the grouped URLs.
171
- machine_label = self.grouped_url_df[
172
- self.grouped_url_df["label"].str.contains(
173
  "gpt-4o",
174
  case=False,
175
  na=False,
@@ -183,15 +177,15 @@ class NewsVerification:
183
 
184
  # labels = " and ".join(machine_label["label"].tolist())
185
  # label = remove_duplicate_words(label)
186
- self.text_prediction_label[0] = label
187
- self.text_prediction_score[0] = machine_label["score"].mean()
188
  else:
189
  # If no 'gpt-4o' labels, assign for 'HUMAN' labels.
190
  machine_label = self.aligned_sentences_df[
191
  self.aligned_sentences_df["label"] == "HUMAN"
192
  ]
193
- self.text_prediction_label[0] = "HUMAN"
194
- self.text_prediction_score[0] = machine_label["score"].mean()
195
  else:
196
  # If no found URLs, use AI detection on the entire input text.
197
  print("No source found in the input text")
@@ -199,34 +193,40 @@ class NewsVerification:
199
 
200
  # Detect text origin using an AI model.
201
  label, score = detect_text_by_ai_model(text)
202
- self.text_prediction_label[0] = label
203
- self.text_prediction_score[0] = score
204
 
205
  def find_text_source(self):
206
  """
207
  Determines the origin of the given text based on paraphrasing
208
  detection and human authorship analysis.
209
 
210
- 1. Splits the input news text into sentences,
211
  2. Searches for sources for each sentence
212
  3. Updates the aligned_sentences_df with the found sources.
213
  """
214
  print("CHECK TEXT:")
215
  print("\tFrom search engine:")
216
-
217
  input_paragraphs = split_into_paragraphs(self.news_text)
218
-
219
- # Initialize an empty DataFrame if it doesn't exist, otherwise extend it.
220
- if not hasattr(self, 'aligned_sentences_df') or self.aligned_sentences_df is None:
221
- self.aligned_sentences_df = pd.DataFrame(columns=[
222
- "input",
223
- "source",
224
- "label",
225
- "similarity",
226
- "paraphrase",
227
- "url",
228
- "entities",
229
- ])
 
 
 
 
 
 
230
 
231
  # Setup DataFrame for input_sentences
232
  for _ in range(len(input_paragraphs)):
@@ -265,19 +265,19 @@ class NewsVerification:
265
  index,
266
  self.aligned_sentences_df,
267
  )
268
-
269
  # Initialize found_img_url if it does not exist.
270
- if not hasattr(self, 'found_img_url'):
271
  self.found_img_url = []
272
  self.found_img_url.extend(img_urls)
273
 
274
  def verify_text(self, url):
275
  """
276
- Verifies the text origin based on similarity scores and labels
277
  associated with a given URL.
278
 
279
- 1. Filters sentences by URL and similarity score,
280
- 2. Determines if the text is likely generated by a machine or a human.
281
  3. Calculates an average similarity score.
282
 
283
  Args:
@@ -285,27 +285,30 @@ class NewsVerification:
285
 
286
  Returns:
287
  tuple: A
288
- - Label ("MACHINE", "HUMAN", or "UNKNOWN")
289
  - Score
290
  """
291
  label = "UNKNOWN"
292
  score = 0
293
-
294
  # calculate the average similarity when the similarity score
295
  # in each row of sentences_df is higher than 0.8
296
-
297
  # Filter sentences by URL.
298
  filtered_by_url = self.aligned_sentences_df[
299
  self.aligned_sentences_df["url"] == url
300
  ]
301
-
302
  # Filter sentences by similarity score (> PARAPHRASE_THRESHOLD).
303
  filtered_by_similarity = filtered_by_url[
304
  filtered_by_url["similarity"] > PARAPHRASE_THRESHOLD
305
  ]
306
-
307
  # Check if a ratio of remaining filtering-sentences is more than 50%.
308
- if len(filtered_by_similarity) / len(self.aligned_sentences_df) > MIN_RATIO_PARAPHRASE_NUM:
 
 
 
309
  # check if "MACHINE" is in self.aligned_sentences_df["label"]:
310
  contains_machine = (
311
  filtered_by_similarity["label"]
@@ -316,7 +319,7 @@ class NewsVerification:
316
  )
317
  .any()
318
  )
319
-
320
  # TODO: integrate with determine_text_origin
321
  if contains_machine:
322
  # If "MACHINE" label is present, set label and calculate score.
@@ -331,7 +334,8 @@ class NewsVerification:
331
  label = f"Partially generated by {generated_model}"
332
  score = machine_rows["similarity"].mean()
333
  else:
334
- # If no "MACHINE" label, assign "HUMAN" label and calculate score.
 
335
  label = "HUMAN"
336
  human_rows = filtered_by_similarity[
337
  filtered_by_similarity["label"].str.contains(
@@ -346,21 +350,21 @@ class NewsVerification:
346
 
347
  def determine_image_origin(self):
348
  """
349
- Determines the origin of the news image using various detection methods.
350
 
351
  1. Matching against previously found image URLs.
352
  2. Reverse image search.
353
  3. AI-based image detection.
354
 
355
- If none of these methods succeed, the image origin is marked as "UNKNOWN".
356
  """
357
  print("CHECK IMAGE:")
358
-
359
  # Handle the case where no image is provided.
360
  if self.news_image is None:
361
- self.image_prediction_label = "UNKNOWN"
362
- self.image_prediction_score = 0.0
363
- self.image_referent_url = None
364
  return
365
 
366
  # Attempt to match the image against previously found image URLs.
@@ -371,9 +375,9 @@ class NewsVerification:
371
  )
372
  if matched_url is not None:
373
  print(f"matched image: {matched_url}\nsimilarity: {similarity}\n")
374
- self.image_prediction_label = "HUMAN"
375
- self.image_prediction_score = similarity
376
- self.image_referent_url = matched_url
377
  return
378
 
379
  # Attempt to find the image origin using reverse image search.
@@ -383,9 +387,9 @@ class NewsVerification:
383
  )
384
  if matched_url is not None:
385
  print(f"matched image: {matched_url}\tScore: {similarity}%\n")
386
- self.image_prediction_label = "HUMAN"
387
- self.image_prediction_score = similarity
388
- self.image_referent_url = matched_url
389
  return
390
 
391
  # Attempt to detect the image origin using an AI model.
@@ -393,15 +397,15 @@ class NewsVerification:
393
  detected_label, score = detect_image_by_ai_model(self.news_image)
394
  if detected_label:
395
  print(f"detected_label: {detected_label} ({score})")
396
- self.image_prediction_label = detected_label
397
- self.image_prediction_score = score
398
- self.image_referent_url = None
399
  return
400
 
401
  # If all detection methods fail, mark the image origin as "UNKNOWN".
402
- self.image_prediction_label = "UNKNOWN"
403
- self.image_prediction_score = 50
404
- self.image_referent_url = None
405
 
406
  def determine_origin(self):
407
  """
@@ -411,13 +415,13 @@ class NewsVerification:
411
  self.determine_text_origin()
412
  if self.news_image != "":
413
  self.determine_image_origin()
414
-
415
  # Handle entity recognition and processing.
416
  self.handle_entities()
417
 
418
  def generate_report(self) -> tuple[str, str, str]:
419
  """
420
- Generates reports tailored for different user roles
421
  (ordinary users, fact checkers, governors).
422
 
423
  Returns:
@@ -426,9 +430,21 @@ class NewsVerification:
426
  - fact_checker_table: Report for fact checkers.
427
  - governor_table: Report for governors.
428
  """
429
- ordinary_user_table = self.create_ordinary_user_table()
430
- fact_checker_table = self.create_fact_checker_table()
431
- governor_table = self.create_governor_table()
 
 
 
 
 
 
 
 
 
 
 
 
432
 
433
  return ordinary_user_table, fact_checker_table, governor_table
434
 
@@ -436,22 +452,22 @@ class NewsVerification:
436
  """
437
  Highlights and assigns entities with colors to aligned sentences
438
  based on grouped URLs.
439
-
440
  For each grouped URL:
441
  1. Highlights entities in the input and source text
442
- 2. Then assigns these highlighted entities to the corresponding
443
  sentences in the aligned sentences DataFrame.
444
  """
445
-
446
  entities_with_colors = []
447
- for index, row in self.grouped_url_df.iterrows():
448
  # Get entity-words (in pair) with colors
449
  entities_with_colors = highlight_entities(
450
  row["input"],
451
  row["source"],
452
  )
453
 
454
- # Assign the highlighted entities to the corresponding sentences
455
  # in aligned_sentences_df.
456
  for index, sentence in self.aligned_sentences_df.iterrows():
457
  if sentence["url"] == row["url"]:
@@ -468,440 +484,3 @@ class NewsVerification:
468
  set: A set containing the unique URLs referenced in the text.
469
  """
470
  return set(self.text_referent_url)
471
-
472
- def create_fact_checker_table(self):
473
- rows = []
474
- rows.append(self.format_image_fact_checker_row())
475
-
476
- for _, row in self.aligned_sentences_df.iterrows():
477
- if row["input"] is None:
478
- continue
479
-
480
- if row["source"] is None:
481
- equal_idx_1 = equal_idx_2 = []
482
-
483
- else: # Get index of equal phrases in input and source sentences
484
- equal_idx_1, equal_idx_2 = extract_equal_text(
485
- row["input"],
486
- row["source"],
487
- )
488
-
489
- self.fact_checker_table.append(
490
- [
491
- row,
492
- equal_idx_1,
493
- equal_idx_2,
494
- row["entities"],
495
- row["url"],
496
- ],
497
- )
498
-
499
- previous_url = None
500
- span_row = 1
501
- for index, row in enumerate(self.fact_checker_table):
502
- current_url = row[4]
503
- last_url_row = False
504
-
505
- # First row or URL change
506
- if index == 0 or current_url != previous_url:
507
- first_url_row = True
508
- previous_url = current_url
509
- # Increase counter "span_row" when the next url is the same
510
- while (
511
- index + span_row < len(self.fact_checker_table)
512
- and self.fact_checker_table[index + span_row][4]
513
- == current_url
514
- ):
515
- span_row += 1
516
-
517
- else:
518
- first_url_row = False
519
- span_row -= 1
520
-
521
- if span_row == 1:
522
- last_url_row = True
523
-
524
- formatted_row = self.format_text_fact_checker_row(
525
- row,
526
- first_url_row,
527
- last_url_row,
528
- span_row,
529
- )
530
- rows.append(formatted_row)
531
-
532
- table = "\n".join(rows)
533
- return f"""
534
- <h5>Comparison between input news and source news:</h5>
535
- <table border="1" style="width:100%; text-align:left;">
536
- <col style="width: 170px;">
537
- <col style="width: 170px;">
538
- <col style="width: 30px;">
539
- <col style="width: 75px;">
540
- <thead>
541
- <tr>
542
- <th>Input news</th>
543
- <th>Source (URL in Originality)</th>
544
- <th>Forensic</th>
545
- <th>Originality</th>
546
- </tr>
547
- </thead>
548
- <tbody>
549
- {table}
550
- </tbody>
551
- </table>
552
-
553
- <style>
554
- """
555
-
556
- def format_text_fact_checker_row(
557
- self,
558
- row,
559
- first_url_row=True,
560
- last_url_row=True,
561
- span_row=1,
562
- ):
563
- entity_count = 0
564
- if row[0]["input"] is None:
565
- return ""
566
- if row[0]["source"] is not None: # source is not empty
567
- if row[3] is not None:
568
- # highlight entities
569
- input_sentence, highlight_idx_input = apply_highlight(
570
- row[0]["input"],
571
- row[3],
572
- "input",
573
- )
574
- source_sentence, highlight_idx_source = apply_highlight(
575
- row[0]["source"],
576
- row[3],
577
- "source",
578
- )
579
- else:
580
- input_sentence = row[0]["input"]
581
- source_sentence = row[0]["source"]
582
- highlight_idx_input = []
583
- highlight_idx_source = []
584
-
585
- if row[3] is not None:
586
- entity_count = len(row[3])
587
-
588
- # Color overlapping words
589
- input_sentence = color_text(
590
- input_sentence,
591
- row[1],
592
- highlight_idx_input,
593
- ) # text, index of highlight words
594
- source_sentence = color_text(
595
- source_sentence,
596
- row[2],
597
- highlight_idx_source,
598
- ) # text, index of highlight words
599
-
600
- # Replace _ to get correct formatting
601
- # Original one having _ for correct word counting
602
- input_sentence = input_sentence.replace(
603
- "span_style",
604
- "span style",
605
- ).replace("1px_4px", "1px 4px")
606
- source_sentence = source_sentence.replace(
607
- "span_style",
608
- "span style",
609
- ).replace("1px_4px", "1px 4px")
610
- else:
611
- input_sentence = row[0]["input"]
612
- source_sentence = row[0]["source"]
613
-
614
- url = row[0]["url"]
615
-
616
- # Displayed label and score by url
617
- filterby_url = self.grouped_url_df[self.grouped_url_df["url"] == url]
618
- if len(filterby_url) > 0:
619
- label = filterby_url["label"].values[0]
620
- score = filterby_url["score"].values[0]
621
- else:
622
- label = self.text_prediction_label[0]
623
- score = self.text_prediction_score[0]
624
-
625
- # Format displayed url
626
- source_text_url = f"""<a href="{url}">{url}</a>"""
627
-
628
- # Format displayed entity count
629
- entity_count_text = format_entity_count(entity_count)
630
-
631
- border_top = "border-top: 1px solid transparent;"
632
- border_bottom = "border-bottom: 1px solid transparent;"
633
- word_break = "word-break: break-all;"
634
- if first_url_row is True:
635
- # First & Last the group: no transparent
636
- if last_url_row is True:
637
- return f"""
638
- <tr>
639
- <td>{input_sentence}</td>
640
- <td>{source_sentence}</td>
641
- <td rowspan="{span_row}">{label}<br>
642
- ({score * 100:.2f}%)<br><br>
643
- {entity_count_text}</td>
644
- <td rowspan="{span_row}"; style="{word_break}";>{source_text_url}</td>
645
- </tr>
646
- """
647
- # First row of the group: transparent bottom border
648
- return f"""
649
- <tr>
650
- <td style="{border_bottom}";>{input_sentence}</td>
651
- <td style="{border_bottom}";>{source_sentence}</td>
652
- <td rowspan="{span_row}">{label}<br>
653
- ({score * 100:.2f}%)<br><br>
654
- {entity_count_text}</td>
655
- <td rowspan="{span_row}"; style="{word_break}";>{source_text_url}</td>
656
- </tr>
657
- """
658
- else:
659
- if last_url_row is True:
660
- # NOT First row, Last row: transparent top border
661
- return f"""
662
- <tr>
663
- <td style="{border_top}";>{input_sentence}</td>
664
- <td style="{border_top}";>{source_sentence}</td>
665
- </tr>
666
- """
667
- else:
668
- # NOT First & NOT Last row: transparent top & bottom borders
669
- return f"""
670
- <tr>
671
- <td style="{border_top} {border_bottom}";>{input_sentence}</td>
672
- <td style="{border_top} {border_bottom}";>{source_sentence}</td>
673
- </tr>
674
- """
675
-
676
- def format_image_fact_checker_row(self):
677
-
678
- if (
679
- self.image_referent_url is not None
680
- or self.image_referent_url != ""
681
- ):
682
- source_image = f"""<img src="{self.image_referent_url}" width="100" height="150">""" # noqa: E501
683
- source_image_url = f"""<a href="{self.image_referent_url}">{self.image_referent_url}</a>""" # noqa: E501
684
- else:
685
- source_image = "Image not found"
686
- source_image_url = ""
687
-
688
- word_break = "word-break: break-all;"
689
- return f"""
690
- <tr>
691
- <td>input image</td>
692
- <td>{source_image}</td>
693
- <td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td>
694
- <td style="{word_break}";>{source_image_url}</td></tr>"""
695
-
696
- def create_ordinary_user_table(self):
697
- rows = []
698
- rows.append(self.format_image_ordinary_user_row())
699
- rows.append(self.format_text_ordinary_user_row())
700
- table = "\n".join(rows)
701
-
702
- return f"""
703
- <h5>Comparison between input news and source news:</h5>
704
- <table border="1" style="width:100%; text-align:left;">
705
- <col style="width: 340px;">
706
- <col style="width: 30px;">
707
- <col style="width: 75px;">
708
- <thead>
709
- <tr>
710
- <th>Input news</th>
711
- <th>Forensic</th>
712
- <th>Originality</th>
713
- </tr>
714
- </thead>
715
- <tbody>
716
- {table}
717
- </tbody>
718
- </table>
719
-
720
- <style>
721
- """
722
-
723
- def format_text_ordinary_user_row(self):
724
- input_sentences = ""
725
- source_text_urls = ""
726
- urls = []
727
- for _, row in self.aligned_sentences_df.iterrows():
728
- if row["input"] is None:
729
- continue
730
- input_sentences += row["input"] + "<br><br>"
731
- url = row["url"]
732
- if url not in urls:
733
- urls.append(url)
734
- source_text_urls += f"""<a href="{url}">{url}</a><br>"""
735
-
736
- word_break = "word-break: break-all;"
737
- return f"""
738
- <tr>
739
- <td>{input_sentences}</td>
740
- <td>{self.text_prediction_label[0]}<br>
741
- ({self.text_prediction_score[0] * 100:.2f}%)</td>
742
- <td style="{word_break}";>{source_text_urls}</td>
743
- </tr>
744
- """
745
-
746
- def format_image_ordinary_user_row(self):
747
-
748
- if (
749
- self.image_referent_url is not None
750
- or self.image_referent_url != ""
751
- ):
752
- source_image_url = f"""<a href="{self.image_referent_url}">{self.image_referent_url}</a>""" # noqa: E501
753
- else:
754
- source_image_url = ""
755
-
756
- word_break = "word-break: break-all;"
757
- return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td style="{word_break}";>{source_image_url}</td></tr>""" # noqa: E501
758
-
759
- def create_governor_table(self):
760
- rows = []
761
- rows.append(self.format_image_governor_row())
762
-
763
- for _, row in self.aligned_sentences_df.iterrows():
764
- if row["input"] is None:
765
- continue
766
-
767
- if row["source"] is None:
768
- equal_idx_1 = equal_idx_2 = []
769
- else:
770
- # Get index of equal phrases in input and source sentences
771
- equal_idx_1, equal_idx_2 = extract_equal_text(
772
- row["input"],
773
- row["source"],
774
- )
775
-
776
- self.governor_table.append(
777
- [
778
- row,
779
- equal_idx_1,
780
- equal_idx_2,
781
- row["entities"],
782
- ],
783
- )
784
-
785
- formatted_row = self.format_text_governor_row()
786
- rows.append(formatted_row)
787
-
788
- table = "\n".join(rows)
789
- return f"""
790
- <h5>Comparison between input news and source news:</h5>
791
- <table border="1" style="width:100%; text-align:left;">
792
- <col style="width: 170px;">
793
- <col style="width: 170px;">
794
- <col style="width: 30px;">
795
- <col style="width: 75px;">
796
- <thead>
797
- <tr>
798
- <th>Input news</th>
799
- <th>Source (URL in Originality)</th>
800
- <th>Forensic</th>
801
- <th>Originality</th>
802
- </tr>
803
- </thead>
804
- <tbody>
805
- {table}
806
- </tbody>
807
- </table>
808
-
809
- <style>
810
- """
811
-
812
- def format_text_governor_row(self):
813
- input_sentences = ""
814
- source_sentences = ""
815
- source_text_urls = ""
816
- urls = []
817
- sentence_count = 0
818
- entity_count = [0, 0] # to get index of [-2]
819
- for row in self.governor_table:
820
- if row[0]["input"] is None:
821
- continue
822
-
823
- if row[0]["source"] is not None: # source is not empty
824
- # highlight entities
825
- input_sentence, highlight_idx_input = apply_highlight(
826
- row[0]["input"],
827
- row[3], # entities_with_colors
828
- "input", # key
829
- entity_count[
830
- -2
831
- ], # since the last one is for current counting
832
- )
833
- source_sentence, highlight_idx_source = apply_highlight(
834
- row[0]["source"],
835
- row[3], # entities_with_colors
836
- "source", # key
837
- entity_count[
838
- -2
839
- ], # since the last one is for current counting
840
- )
841
-
842
- # Color overlapping words
843
- input_sentence = color_text(
844
- input_sentence,
845
- row[1],
846
- highlight_idx_input,
847
- ) # text, index of highlight words
848
- source_sentence = color_text(
849
- source_sentence,
850
- row[2],
851
- highlight_idx_source,
852
- ) # text, index of highlight words
853
-
854
- input_sentence = input_sentence.replace(
855
- "span_style",
856
- "span style",
857
- ).replace("1px_4px", "1px 4px")
858
- source_sentence = source_sentence.replace(
859
- "span_style",
860
- "span style",
861
- ).replace("1px_4px", "1px 4px")
862
-
863
- else:
864
- if row[0]["source"] is None:
865
- source_sentence = ""
866
- else:
867
- source_sentence = row[0]["source"]
868
- input_sentence = row[0]["input"]
869
-
870
- # convert score to HUMAN-based score:
871
- input_sentences += input_sentence + "<br><br>"
872
- source_sentences += source_sentence + "<br><br>"
873
-
874
- url = row[0]["url"]
875
- if url not in urls:
876
- urls.append(url)
877
- source_text_urls += f"""<a href="{url}">{url}</a><br><br>"""
878
- sentence_count += 1
879
- if row[3] is not None:
880
- entity_count.append(len(row[3]))
881
-
882
- entity_count_text = format_entity_count(sum(entity_count))
883
- word_break = "word-break: break-all;"
884
- return f"""
885
- <tr>
886
- <td>{input_sentences}</td>
887
- <td>{source_sentences}</td>
888
- <td>{self.text_prediction_label[0]}<br>
889
- ({self.text_prediction_score[0] * 100:.2f}%)<br><br>
890
- {entity_count_text}</td>
891
- <td style="{word_break}";>{source_text_urls}</td>
892
- </tr>
893
- """
894
-
895
- def format_image_governor_row(self):
896
- if (
897
- self.image_referent_url is not None
898
- or self.image_referent_url != ""
899
- ):
900
- source_image = f"""<img src="{self.image_referent_url}" width="100" height="150">""" # noqa: E501
901
- source_image_url = f"""<a href="{self.image_referent_url}">{self.image_referent_url}</a>""" # noqa: E501
902
- else:
903
- source_image = "Image not found"
904
- source_image_url = ""
905
-
906
- word_break = "word-break: break-all;"
907
- return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td style="{word_break}";>{source_image_url}</td></tr>""" # noqa: E501
 
5
 
6
  import pandas as pd
7
 
8
+ from src.application.config import (
9
+ MIN_RATIO_PARAPHRASE_NUM,
10
+ PARAPHRASE_THRESHOLD,
11
+ PARAPHRASE_THRESHOLD_MACHINE,
12
+ )
13
+ from src.application.formatting_fact_checker import create_fact_checker_table
14
+ from src.application.formatting_governor import create_governor_table
15
+ from src.application.formatting_ordinary_user import create_ordinary_user_table
16
+ from src.application.image.image import ImageDetector
17
  from src.application.image.image_detection import (
18
  detect_image_by_ai_model,
19
  detect_image_by_reverse_search,
20
  detect_image_from_news_image,
21
  )
22
+ from src.application.text.entity import highlight_entities
 
 
 
23
  from src.application.text.helper import (
 
24
  postprocess_label,
25
  split_into_paragraphs,
26
  )
 
29
  predict_generation_model,
30
  )
31
  from src.application.text.search_detection import find_sentence_source
32
+ from src.application.text.text import TextDetector
33
 
34
 
35
  class NewsVerification:
 
42
  self.news_content: str = ""
43
  self.news_image: str = ""
44
 
45
+ self.text = TextDetector()
46
+ self.image = ImageDetector()
 
 
 
 
47
 
48
  self.news_prediction_label: str = ""
49
  self.news_prediction_score: float = -1
 
63
  # "entities",
64
  ],
65
  )
 
 
 
 
 
 
66
 
67
  def load_news(self, news_title: str, news_content: str, news_image: str):
68
  """
 
105
  ) # Handle mixed data types and NaNs
106
 
107
  # Group sentences by URL and concatenate 'input' and 'source' text.
108
+ self.text.grouped_url_df = (
109
  self.aligned_sentences_df.groupby("url")
110
  .agg(
111
  {
 
117
  ) # Reset index to make 'url' a regular column
118
 
119
  # Add new columns for label and score
120
+ self.text.grouped_url_df["label"] = None
121
+ self.text.grouped_url_df["score"] = None
122
 
123
  print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")
124
 
 
126
  """
127
  Determines the text origin for each URL group.
128
  """
129
+ for index, row in self.text.grouped_url_df.iterrows():
130
  # Verify text origin using URL-based verification.
131
  label, score = self.verify_text(row["url"])
132
 
 
138
  # Detect text origin using an AI model.
139
  label, score = detect_text_by_ai_model(text)
140
 
141
+ self.text.grouped_url_df.at[index, "label"] = label
142
+ self.text.grouped_url_df.at[index, "score"] = score
143
 
144
  def determine_text_origin(self):
145
  """
 
160
  self.determine_text_origin_by_url()
161
 
162
  # Determine the overall label and score for the entire input text.
163
+ if not self.text.grouped_url_df.empty:
164
  # Check for 'gpt-4o' labels in the grouped URLs.
165
+ machine_label = self.text.grouped_url_df[
166
+ self.text.grouped_url_df["label"].str.contains(
167
  "gpt-4o",
168
  case=False,
169
  na=False,
 
177
 
178
  # labels = " and ".join(machine_label["label"].tolist())
179
  # label = remove_duplicate_words(label)
180
+ self.text.prediction_label[0] = label
181
+ self.text.prediction_score[0] = machine_label["score"].mean()
182
  else:
183
  # If no 'gpt-4o' labels, assign for 'HUMAN' labels.
184
  machine_label = self.aligned_sentences_df[
185
  self.aligned_sentences_df["label"] == "HUMAN"
186
  ]
187
+ self.text.prediction_label[0] = "HUMAN"
188
+ self.text.prediction_score[0] = machine_label["score"].mean()
189
  else:
190
  # If no found URLs, use AI detection on the entire input text.
191
  print("No source found in the input text")
 
193
 
194
  # Detect text origin using an AI model.
195
  label, score = detect_text_by_ai_model(text)
196
+ self.text.prediction_label[0] = label
197
+ self.text.prediction_score[0] = score
198
 
199
  def find_text_source(self):
200
  """
201
  Determines the origin of the given text based on paraphrasing
202
  detection and human authorship analysis.
203
 
204
+ 1. Splits the input news text into sentences,
205
  2. Searches for sources for each sentence
206
  3. Updates the aligned_sentences_df with the found sources.
207
  """
208
  print("CHECK TEXT:")
209
  print("\tFrom search engine:")
210
+
211
  input_paragraphs = split_into_paragraphs(self.news_text)
212
+
213
+ # Initialize an empty DataFrame if it doesn't exist,
214
+ # otherwise extend it.
215
+ if (
216
+ not hasattr(self, "aligned_sentences_df")
217
+ or self.aligned_sentences_df is None
218
+ ):
219
+ self.aligned_sentences_df = pd.DataFrame(
220
+ columns=[
221
+ "input",
222
+ "source",
223
+ "label",
224
+ "similarity",
225
+ "paraphrase",
226
+ "url",
227
+ "entities",
228
+ ],
229
+ )
230
 
231
  # Setup DataFrame for input_sentences
232
  for _ in range(len(input_paragraphs)):
 
265
  index,
266
  self.aligned_sentences_df,
267
  )
268
+
269
  # Initialize found_img_url if it does not exist.
270
+ if not hasattr(self, "found_img_url"):
271
  self.found_img_url = []
272
  self.found_img_url.extend(img_urls)
273
 
274
  def verify_text(self, url):
275
  """
276
+ Verifies the text origin based on similarity scores and labels
277
  associated with a given URL.
278
 
279
+ 1. Filters sentences by URL and similarity score,
280
+ 2. Determines if the text is likely generated by a machine or a human.
281
  3. Calculates an average similarity score.
282
 
283
  Args:
 
285
 
286
  Returns:
287
  tuple: A
288
+ - Label ("MACHINE", "HUMAN", or "UNKNOWN")
289
  - Score
290
  """
291
  label = "UNKNOWN"
292
  score = 0
293
+
294
  # calculate the average similarity when the similarity score
295
  # in each row of sentences_df is higher than 0.8
296
+
297
  # Filter sentences by URL.
298
  filtered_by_url = self.aligned_sentences_df[
299
  self.aligned_sentences_df["url"] == url
300
  ]
301
+
302
  # Filter sentences by similarity score (> PARAPHRASE_THRESHOLD).
303
  filtered_by_similarity = filtered_by_url[
304
  filtered_by_url["similarity"] > PARAPHRASE_THRESHOLD
305
  ]
306
+
307
  # Check if a ratio of remaining filtering-sentences is more than 50%.
308
+ if (
309
+ len(filtered_by_similarity) / len(self.aligned_sentences_df)
310
+ > MIN_RATIO_PARAPHRASE_NUM
311
+ ):
312
  # check if "MACHINE" is in self.aligned_sentences_df["label"]:
313
  contains_machine = (
314
  filtered_by_similarity["label"]
 
319
  )
320
  .any()
321
  )
322
+
323
  # TODO: integrate with determine_text_origin
324
  if contains_machine:
325
  # If "MACHINE" label is present, set label and calculate score.
 
334
  label = f"Partially generated by {generated_model}"
335
  score = machine_rows["similarity"].mean()
336
  else:
337
+ # If no "MACHINE" label,
338
+ # assign "HUMAN" label and calculate score.
339
  label = "HUMAN"
340
  human_rows = filtered_by_similarity[
341
  filtered_by_similarity["label"].str.contains(
 
350
 
351
  def determine_image_origin(self):
352
  """
353
+ Determines the origin of the news image using 3 detection methods.
354
 
355
  1. Matching against previously found image URLs.
356
  2. Reverse image search.
357
  3. AI-based image detection.
358
 
359
+ If none of these methods succeed, the image origin is "UNKNOWN".
360
  """
361
  print("CHECK IMAGE:")
362
+
363
  # Handle the case where no image is provided.
364
  if self.news_image is None:
365
+ self.image.prediction_label = "UNKNOWN"
366
+ self.image.prediction_score = 0.0
367
+ self.image.referent_url = None
368
  return
369
 
370
  # Attempt to match the image against previously found image URLs.
 
375
  )
376
  if matched_url is not None:
377
  print(f"matched image: {matched_url}\nsimilarity: {similarity}\n")
378
+ self.image.prediction_label = "HUMAN"
379
+ self.image.prediction_score = similarity
380
+ self.image.referent_url = matched_url
381
  return
382
 
383
  # Attempt to find the image origin using reverse image search.
 
387
  )
388
  if matched_url is not None:
389
  print(f"matched image: {matched_url}\tScore: {similarity}%\n")
390
+ self.image.prediction_label = "HUMAN"
391
+ self.image.prediction_score = similarity
392
+ self.image.referent_url = matched_url
393
  return
394
 
395
  # Attempt to detect the image origin using an AI model.
 
397
  detected_label, score = detect_image_by_ai_model(self.news_image)
398
  if detected_label:
399
  print(f"detected_label: {detected_label} ({score})")
400
+ self.image.prediction_label = detected_label
401
+ self.image.prediction_score = score
402
+ self.image.referent_url = None
403
  return
404
 
405
  # If all detection methods fail, mark the image origin as "UNKNOWN".
406
+ self.image.prediction_label = "UNKNOWN"
407
+ self.image.prediction_score = 50
408
+ self.image.referent_url = None
409
 
410
  def determine_origin(self):
411
  """
 
415
  self.determine_text_origin()
416
  if self.news_image != "":
417
  self.determine_image_origin()
418
+
419
  # Handle entity recognition and processing.
420
  self.handle_entities()
421
 
422
  def generate_report(self) -> tuple[str, str, str]:
423
  """
424
+ Generates reports tailored for different user roles
425
  (ordinary users, fact checkers, governors).
426
 
427
  Returns:
 
430
  - fact_checker_table: Report for fact checkers.
431
  - governor_table: Report for governors.
432
  """
433
+ ordinary_user_table = create_ordinary_user_table(
434
+ self.aligned_sentences_df,
435
+ self.text,
436
+ self.image,
437
+ )
438
+ fact_checker_table = create_fact_checker_table(
439
+ self.aligned_sentences_df,
440
+ self.text,
441
+ self.image,
442
+ )
443
+ governor_table = create_governor_table(
444
+ self.aligned_sentences_df,
445
+ self.text,
446
+ self.image,
447
+ )
448
 
449
  return ordinary_user_table, fact_checker_table, governor_table
450
 
 
452
  """
453
  Highlights and assigns entities with colors to aligned sentences
454
  based on grouped URLs.
455
+
456
  For each grouped URL:
457
  1. Highlights entities in the input and source text
458
+ 2. Then assigns these highlighted entities to the corresponding
459
  sentences in the aligned sentences DataFrame.
460
  """
461
+
462
  entities_with_colors = []
463
+ for index, row in self.text.grouped_url_df.iterrows():
464
  # Get entity-words (in pair) with colors
465
  entities_with_colors = highlight_entities(
466
  row["input"],
467
  row["source"],
468
  )
469
 
470
+ # Assign the highlighted entities to the corresponding sentences
471
  # in aligned_sentences_df.
472
  for index, sentence in self.aligned_sentences_df.iterrows():
473
  if sentence["url"] == row["url"]:
 
484
  set: A set containing the unique URLs referenced in the text.
485
  """
486
  return set(self.text_referent_url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/application/content_generation.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
 
3
  import openai
4
  import pandas as pd
@@ -100,7 +101,7 @@ def extract_title_content(fake_news: str) -> tuple[str, str]:
100
  def generate_fake_image(
101
  title: str,
102
  model: str = GPT_IMAGE_MODEL,
103
- ) -> str | None:
104
  """
105
  Generates a fake image URL using Azure OpenAI's image generation API.
106
 
 
1
  import json
2
+ from typing import Optional
3
 
4
  import openai
5
  import pandas as pd
 
101
  def generate_fake_image(
102
  title: str,
103
  model: str = GPT_IMAGE_MODEL,
104
+ ) -> Optional[str]:
105
  """
106
  Generates a fake image URL using Azure OpenAI's image generation API.
107
 
src/application/formatting.py CHANGED
@@ -1,18 +1,26 @@
1
- from src.application.text.helper import extract_starts_ends, filter_indices
 
 
 
2
 
3
 
4
- def color_text(text: str, colored_idx: list[dict], highlighted_idx: list[int]) -> str:
 
 
 
 
5
  """
6
  Colors specific words in a text based on provided indices.
7
 
8
- This method takes a text, a list of indices to color, and a list of indices to exclude.
9
- It splits the text into words, filters the indices, and then wraps the words within
10
- the specified ranges with a green span tag for coloring.
11
 
12
  Args:
13
  text (str): The input text.
14
- colored_idx (list): A list of dictionaries, where each dictionary contains
15
- 'start' and 'end' keys representing indices of words to color.
 
16
  highlighted_idx (list): A list of indices to exclude from coloring.
17
 
18
  Returns:
@@ -23,7 +31,7 @@ def color_text(text: str, colored_idx: list[dict], highlighted_idx: list[int]) -
23
 
24
  # Extract start and end indices from colored_idx.
25
  starts, ends = extract_starts_ends(colored_idx)
26
-
27
  # Filter the start and end indices to exclude highlighted_idx.
28
  starts, ends = filter_indices(starts, ends, highlighted_idx)
29
 
@@ -64,4 +72,4 @@ def format_entity_count(entity_count: int) -> str:
64
  entity_count_text = "with 1 altered entity"
65
  else:
66
  entity_count_text = "with altered entities"
67
- return entity_count_text
 
1
+ from src.application.text.helper import (
2
+ extract_starts_ends,
3
+ filter_indices,
4
+ )
5
 
6
 
7
+ def color_text(
8
+ text: str,
9
+ colored_idx: list[dict],
10
+ highlighted_idx: list[int],
11
+ ) -> str:
12
  """
13
  Colors specific words in a text based on provided indices.
14
 
15
+ 1. splits the text into words
16
+ 2. filters the indices
17
+ 3. wraps the words within the specified ranges with a coloring tag
18
 
19
  Args:
20
  text (str): The input text.
21
+ colored_idx (list): A list of dictionaries,
22
+ where each dictionary contains
23
+ 'start' and 'end' keys representing indices of words to color.
24
  highlighted_idx (list): A list of indices to exclude from coloring.
25
 
26
  Returns:
 
31
 
32
  # Extract start and end indices from colored_idx.
33
  starts, ends = extract_starts_ends(colored_idx)
34
+
35
  # Filter the start and end indices to exclude highlighted_idx.
36
  starts, ends = filter_indices(starts, ends, highlighted_idx)
37
 
 
72
  entity_count_text = "with 1 altered entity"
73
  else:
74
  entity_count_text = "with altered entities"
75
+ return entity_count_text
src/application/formatting_fact_checker.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pandas import DataFrame
2
+
3
+ from src.application.config import WORD_BREAK
4
+ from src.application.formatting import (
5
+ color_text,
6
+ format_entity_count,
7
+ )
8
+ from src.application.image.image import ImageDetector
9
+ from src.application.text.entity import apply_highlight
10
+ from src.application.text.helper import extract_equal_text
11
+ from src.application.text.text import TextDetector
12
+
13
+
14
def create_fact_checker_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
):
    """
    Builds the HTML comparison table shown to fact checkers.

    The table starts with one row for the image, followed by one row per
    aligned sentence. Consecutive sentences that share a URL are grouped:
    the first row of each group carries the label/score/URL cells with a
    rowspan covering the whole group.

    Args:
        aligned_sentences_df (DataFrame): Aligned sentences; the columns
            "input", "source", "entities" and "url" are read.
        text (TextDetector): Text results. Its `fact_checker_table`
            attribute is replaced with the rows collected here.
        image (ImageDetector): Image results used for the first row.

    Returns:
        str: The HTML of the table.
    """
    rows = [format_image_fact_checker_row(image)]

    # Collect into a fresh list instead of appending to whatever
    # text.fact_checker_table already holds; otherwise a second call
    # would emit every sentence twice.
    collected = []
    for _, row in aligned_sentences_df.iterrows():
        if row["input"] is None:
            continue

        if row["source"] is None:
            # Two distinct lists, so the index lists are never aliased.
            equal_idx_1, equal_idx_2 = [], []
        else:
            # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                row["input"],
                row["source"],
            )

        collected.append(
            [
                row,  # aligned_sentences_df
                equal_idx_1,  # index of equal text in input
                equal_idx_2,  # index of equal text in source
                row["entities"],
                row["url"],
            ],
        )
    text.fact_checker_table = collected

    previous_url = None
    # Number of rows left in the current URL group, current row included.
    span_row = 1
    for index, row in enumerate(collected):
        current_url = row[4]

        if index == 0 or current_url != previous_url:
            # First row of a new group: look ahead to count how many of
            # the following rows share this URL.
            first_url_row = True
            previous_url = current_url
            while (
                index + span_row < len(collected)
                and collected[index + span_row][4] == current_url
            ):
                span_row += 1
        else:
            first_url_row = False
            span_row -= 1

        # When only the current row remains, it closes the group.
        last_url_row = span_row == 1

        rows.append(
            format_text_fact_checker_row(
                text,
                row,
                first_url_row,
                last_url_row,
                span_row,
            ),
        )

    table = "\n".join(rows)
    # NOTE(review): the markup ends with an unclosed <style> tag; it is
    # kept as-is because the other report tables end the same way.
    # Confirm whether it is intentional.
    return f"""
    <h5>Comparison between input news and source news:</h5>
    <table border="1" style="width:100%; text-align:left;">
        <col style="width: 170px;">
        <col style="width: 170px;">
        <col style="width: 30px;">
        <col style="width: 75px;">
        <thead>
            <tr>
                <th>Input news</th>
                <th>Source (URL in Originality)</th>
                <th>Forensic</th>
                <th>Originality</th>
            </tr>
        </thead>
        <tbody>
            {table}
        </tbody>
    </table>
    <style>
    """
100
+
101
+
102
def format_text_fact_checker_row(
    text: TextDetector,
    row: list,
    first_url_row: bool = True,
    last_url_row: bool = True,
    span_row: int = 1,
):
    """
    Formats one sentence row of the fact-checker table.

    Args:
        text (TextDetector): Text results; `grouped_url_df` supplies the
            per-URL label and score, with `prediction_label[0]` /
            `prediction_score[0]` as the fallback.
        row (list): One entry of `text.fact_checker_table`:
            [sentence row, equal-text indices in input,
            equal-text indices in source, entities, url].
        first_url_row (bool): Whether this is the first row of its URL
            group. Only such rows get the label/score and URL cells.
        last_url_row (bool): Whether this is the last row of its URL group.
        span_row (int): rowspan used for the label/score and URL cells.

    Returns:
        str: The HTML `<tr>` for the row, or "" if there is no input
            sentence.
    """
    entity_count = 0
    print(f"row: {row}")
    sentence = row[0]
    if sentence["input"] is None:
        return ""

    if sentence["source"] is not None:  # source is not empty
        entities = row[3]
        if entities is not None:
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                sentence["input"],
                entities,
                "input",
            )
            source_sentence, highlight_idx_source = apply_highlight(
                sentence["source"],
                entities,
                "source",
            )
            entity_count = len(entities)
        else:
            input_sentence = sentence["input"]
            source_sentence = sentence["source"]
            highlight_idx_input = []
            highlight_idx_source = []

        # Color overlapping words
        input_sentence = color_text(
            input_sentence,
            row[1],
            highlight_idx_input,
        )  # text, index of highlight words
        source_sentence = color_text(
            source_sentence,
            row[2],
            highlight_idx_source,
        )  # text, index of highlight words

        # Replace _ to get correct formatting
        # Original one having _ for correct word counting
        input_sentence = input_sentence.replace(
            "span_style",
            "span style",
        ).replace("1px_4px", "1px 4px")
        source_sentence = source_sentence.replace(
            "span_style",
            "span style",
        ).replace("1px_4px", "1px 4px")
    else:
        input_sentence = sentence["input"]
        # Show an empty cell rather than the literal text "None".
        source_sentence = ""

    url = sentence["url"]

    # Displayed label and score by url. The "url" column is checked first
    # because grouped_url_df may be a frame without columns, in which
    # case indexing it would raise KeyError.
    grouped = text.grouped_url_df
    filterby_url = None
    if "url" in grouped.columns:
        filterby_url = grouped[grouped["url"] == url]
    if filterby_url is not None and len(filterby_url) > 0:
        label = filterby_url["label"].values[0]
        score = filterby_url["score"].values[0]
    else:
        label = text.prediction_label[0]
        score = text.prediction_score[0]

    # Format displayed url
    source_text_url = f"""<a href="{url}">{url}</a>"""

    # Format displayed entity count
    entity_count_text = format_entity_count(entity_count)

    # Borders between rows of the same URL group are made transparent:
    # no top border unless this is the first row of the group, no bottom
    # border unless it is the last one.
    borders = []
    if not first_url_row:
        borders.append("border-top: 1px solid transparent;")
    if not last_url_row:
        borders.append("border-bottom: 1px solid transparent;")
    cell_attr = ""
    if borders:
        joined_borders = " ".join(borders)
        cell_attr = f' style="{joined_borders}";'

    # Only the first row of a group carries the label/score and URL
    # cells; their rowspan makes them cover the remaining rows.
    forensic_cells = ""
    if first_url_row:
        forensic_cells = f"""
        <td rowspan="{span_row}">{label}<br>
            ({score * 100:.2f}%)<br><br>
            {entity_count_text}</td>
        <td rowspan="{span_row}"; style="{WORD_BREAK}";>{source_text_url}</td>"""  # noqa: E501

    return f"""
    <tr>
        <td{cell_attr}>{input_sentence}</td>
        <td{cell_attr}>{source_sentence}</td>{forensic_cells}
    </tr>
    """
221
+
222
+
223
def format_image_fact_checker_row(image):
    """
    Formats the image row of the fact-checker table.

    Args:
        image: Image results; `referent_url`, `prediction_label` and
            `prediction_score` are read.

    Returns:
        str: The HTML `<tr>` for the image.
    """
    # A source image exists only if the URL is neither None nor "".
    # (The previous `is not None or != ""` test was true for every value,
    # so the "Image not found" branch could never run.)
    if image.referent_url:
        source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
        source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
    else:
        source_image = "Image not found"
        source_image_url = ""

    return f"""
    <tr>
        <td>input image</td>
        <td>{source_image}</td>
        <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
        <td style="{WORD_BREAK}";>{source_image_url}</td>
    </tr>
    """
src/application/formatting_governor.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pandas import DataFrame
2
+
3
+ from src.application.config import WORD_BREAK
4
+ from src.application.formatting import (
5
+ color_text,
6
+ format_entity_count,
7
+ )
8
+ from src.application.image.image import ImageDetector
9
+ from src.application.text.entity import apply_highlight
10
+ from src.application.text.helper import extract_equal_text
11
+ from src.application.text.text import TextDetector
12
+
13
+
14
def create_governor_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
):
    """
    Builds the HTML comparison table shown to governors.

    The table has one row for the image and one row that combines all
    aligned sentences.

    Args:
        aligned_sentences_df (DataFrame): Aligned sentences; the columns
            "input", "source" and "entities" are read here.
        text (TextDetector): Text results. Its `governor_table` attribute
            is replaced with the rows collected here.
        image (ImageDetector): Image results used for the first row.

    Returns:
        str: The HTML of the table.
    """
    rows = [format_image_governor_row(image)]

    # Collect into a fresh list instead of appending to whatever
    # text.governor_table already holds; otherwise a second call would
    # emit every sentence twice.
    collected = []
    for _, row in aligned_sentences_df.iterrows():
        if row["input"] is None:
            continue

        if row["source"] is None:
            # Two distinct lists, so the index lists are never aliased.
            equal_idx_1, equal_idx_2 = [], []
        else:
            # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                row["input"],
                row["source"],
            )

        collected.append(
            [
                row,
                equal_idx_1,
                equal_idx_2,
                row["entities"],
            ],
        )
    # Must be assigned before formatting: the text row is rendered from
    # text.governor_table.
    text.governor_table = collected

    rows.append(format_text_governor_row(text))

    table = "\n".join(rows)
    # NOTE(review): the markup ends with an unclosed <style> tag; it is
    # kept as-is because the other report tables end the same way.
    # Confirm whether it is intentional.
    return f"""
    <h5>Comparison between input news and source news:</h5>
    <table border="1" style="width:100%; text-align:left;">
        <col style="width: 170px;">
        <col style="width: 170px;">
        <col style="width: 30px;">
        <col style="width: 75px;">
        <thead>
            <tr>
                <th>Input news</th>
                <th>Source (URL in Originality)</th>
                <th>Forensic</th>
                <th>Originality</th>
            </tr>
        </thead>
        <tbody>
            {table}
        </tbody>
    </table>

    <style>
    """
70
+
71
+
72
def format_text_governor_row(text):
    """
    Formats the single text row of the governor table.

    All entries of `text.governor_table` are merged into one row: input
    sentences, source sentences and the distinct source URLs are each
    concatenated into one cell.

    Args:
        text: Text results; `governor_table`, `prediction_label[0]` and
            `prediction_score[0]` are read.

    Returns:
        str: The HTML `<tr>` for the text.
    """
    input_parts = []
    source_parts = []
    link_parts = []
    seen_urls = []
    # Starts with two zeros so that entity_count[-2] exists on the first
    # pass ("to get index of [-2]").
    entity_count = [0, 0]

    for entry in text.governor_table:
        sentence = entry[0]
        if sentence["input"] is None:
            continue

        if sentence["source"] is None:
            # No source sentence: plain input, empty source cell.
            input_sentence = sentence["input"]
            source_sentence = ""
        else:
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                sentence["input"],
                entry[3],  # entities_with_colors
                "input",  # key
                entity_count[-2],  # since the last one is for current counting
            )
            source_sentence, highlight_idx_source = apply_highlight(
                sentence["source"],
                entry[3],  # entities_with_colors
                "source",  # key
                entity_count[-2],  # since the last one is for current counting
            )

            # Color overlapping words
            input_sentence = color_text(
                input_sentence,
                entry[1],
                highlight_idx_input,
            )  # text, index of highlight words
            source_sentence = color_text(
                source_sentence,
                entry[2],
                highlight_idx_source,
            )  # text, index of highlight words

            # Turn the "_" placeholders back into spaces.
            input_sentence = input_sentence.replace(
                "span_style",
                "span style",
            ).replace("1px_4px", "1px 4px")
            source_sentence = source_sentence.replace(
                "span_style",
                "span style",
            ).replace("1px_4px", "1px 4px")

        input_parts.append(input_sentence + "<br><br>")
        source_parts.append(source_sentence + "<br><br>")

        # List each source URL once, in order of first appearance.
        url = sentence["url"]
        if url not in seen_urls:
            seen_urls.append(url)
            link_parts.append(f"""<a href="{url}">{url}</a><br><br>""")

        if entry[3] is not None:
            entity_count.append(len(entry[3]))

    input_sentences = "".join(input_parts)
    source_sentences = "".join(source_parts)
    source_text_urls = "".join(link_parts)
    entity_count_text = format_entity_count(sum(entity_count))
    return f"""
    <tr>
        <td>{input_sentences}</td>
        <td>{source_sentences}</td>
        <td>{text.prediction_label[0]}<br>
            ({text.prediction_score[0] * 100:.2f}%)<br><br>
            {entity_count_text}</td>
        <td style="{WORD_BREAK}";>{source_text_urls}</td>
    </tr>
    """
149
+
150
+
151
def format_image_governor_row(image):
    """
    Formats the image row of the governor table.

    Args:
        image: Image results; `referent_url`, `prediction_label` and
            `prediction_score` are read.

    Returns:
        str: The HTML `<tr>` for the image.
    """
    # A source image exists only if the URL is neither None nor "".
    # (The previous `is not None or != ""` test was true for every value,
    # so the "Image not found" branch could never run.)
    if image.referent_url:
        source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
        source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
    else:
        source_image = "Image not found"
        source_image_url = ""

    return f"""
    <tr>
        <td>input image</td>
        <td>{source_image}</td>
        <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
        <td style="{WORD_BREAK}";>{source_image_url}</td>
    </tr>"""
src/application/formatting_ordinary_user.py CHANGED
@@ -1,10 +1,18 @@
 
 
1
  from src.application.config import WORD_BREAK
 
 
2
 
3
 
4
- def create_ordinary_user_table(self):
 
 
 
 
5
  rows = []
6
- rows.append(self.format_image_ordinary_user_row())
7
- rows.append(self.format_text_ordinary_user_row())
8
  table = "\n".join(rows)
9
 
10
  return f"""
@@ -28,60 +36,56 @@ def create_ordinary_user_table(self):
28
  <style>
29
  """
30
 
31
- def format_text_ordinary_user_row(self):
 
 
 
 
32
  input_sentences = ""
33
- source_text_urls = ""
34
  urls = []
35
- for _, row in self.aligned_sentences_df.iterrows():
36
  if row["input"] is None:
37
  continue
38
-
39
  input_sentences += row["input"] + "<br><br>"
40
  url = row["url"]
41
  if url not in urls:
42
  urls.append(url)
43
- source_text_urls += f"""<a href="{url}">{url}</a><br>"""
44
 
45
  return f"""
46
  <tr>
47
  <td>{input_sentences}</td>
48
- <td>{self.text_prediction_label[0]}<br>
49
- ({self.text_prediction_score[0] * 100:.2f}%)</td>
50
- <td style="{WORD_BREAK}";>{source_text_urls}</td>
51
  </tr>
52
  """
53
 
54
- def format_image_ordinary_user_row(
55
- image_referent_url: str,
56
- image_prediction_label: str,
57
- image_prediction_score: float,
58
- ):
59
  """
60
- Formats an HTML table row for ordinary users,
61
  displaying image analysis results.
62
 
63
  Args:
64
- image_referent_url (str): The URL of the referenced image.
65
- image_prediction_label (str): The predicted label for the image.
66
- image_prediction_score (float): The prediction score for the image.
67
 
68
  Returns:
69
  str: An HTML table row string containing the image analysis results.
70
  """
71
 
72
  # Put image, label, and score into html tag
73
- if (
74
- image_referent_url is not None
75
- or image_referent_url != ""
76
- ):
77
- source_image_url = f"""<a href="{image_referent_url}">{image_referent_url}</a>""" # noqa: E501
78
  else:
79
- source_image_url = ""
80
 
81
  return f"""
82
  <tr>
83
  <td>input image</td>
84
- <td>{image_prediction_label}<br>({image_prediction_score:.2f}%)</td>
85
- <td style="{WORD_BREAK}";>{source_image_url}</td>
86
  </tr>
87
- """
 
1
+ from pandas import DataFrame
2
+
3
  from src.application.config import WORD_BREAK
4
+ from src.application.image.image import ImageDetector
5
+ from src.application.text.text import TextDetector
6
 
7
 
8
+ def create_ordinary_user_table(
9
+ aligned_sentences_df: DataFrame,
10
+ text: TextDetector,
11
+ image: ImageDetector,
12
+ ) -> str:
13
  rows = []
14
+ rows.append(format_image_ordinary_user_row(image))
15
+ rows.append(format_text_ordinary_user_row(aligned_sentences_df, text))
16
  table = "\n".join(rows)
17
 
18
  return f"""
 
36
  <style>
37
  """
38
 
39
+
40
def format_text_ordinary_user_row(
    aligned_sentences_df,
    text,
) -> str:
    """
    Formats the text row of the ordinary-user table.

    Args:
        aligned_sentences_df: Aligned sentences; the columns "input" and
            "url" are read.
        text: Text results; `prediction_label[0]` and
            `prediction_score[0]` are displayed.

    Returns:
        str: The HTML `<tr>` holding all input sentences, the text label
            with its score, and one link per distinct URL.
    """
    sentence_parts = []
    link_parts = []
    seen_urls = []
    for _, sentence in aligned_sentences_df.iterrows():
        if sentence["input"] is None:
            continue

        sentence_parts.append(sentence["input"] + "<br><br>")

        # List each URL once, in order of first appearance.
        link = sentence["url"]
        if link in seen_urls:
            continue
        seen_urls.append(link)
        link_parts.append(f"""<a href="{link}">{link}</a><br>""")

    input_sentences = "".join(sentence_parts)
    source_text_html = "".join(link_parts)

    return f"""
    <tr>
        <td>{input_sentences}</td>
        <td>{text.prediction_label[0]}<br>
            ({text.prediction_score[0] * 100:.2f}%)</td>
        <td style="{WORD_BREAK}";>{source_text_html}</td>
    </tr>
    """
65
 
66
+
67
def format_image_ordinary_user_row(image: ImageDetector) -> str:
    """
    Formats an HTML table row for ordinary users,
    displaying image analysis results.

    Args:
        image (ImageDetector): Image results; `referent_url`,
            `prediction_label` and `prediction_score` are read.

    Returns:
        str: An HTML table row string containing the image analysis results.
    """

    # Put image, label, and score into html tag.
    # A link is shown only if the URL is neither None nor "". (The
    # previous `is not None or != ""` test was true for every value, so
    # a missing URL was rendered as a link to "None".)
    if image.referent_url:
        source_image_html = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
    else:
        source_image_html = ""

    return f"""
    <tr>
        <td>input image</td>
        <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
        <td style="{WORD_BREAK}";>{source_image_html}</td>
    </tr>
    """
src/application/image/image.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
class ImageDetector:
    """
    Holds the result of the image analysis: the predicted label, its
    score, and the URL of the referenced image (if any).
    """

    def __init__(self):
        # URL of the referenced image; None while no source is known.
        # (Annotations inside a function body are never evaluated, so
        # the union syntax is safe on older interpreters too.)
        self.referent_url: str | None = None
        # Start from a defined "nothing detected" state instead of None,
        # so that formatting the score with a float format spec works
        # even when no image detection has run.
        self.prediction_label: str = "UNKNOWN"
        self.prediction_score: float = 0.0
src/application/text/helper.py CHANGED
@@ -296,11 +296,11 @@ def postprocess_label(labels: list[str]) -> str:
296
  prefix = "Partially generated by "
297
  for index, label in enumerate(labels):
298
  if label.startswith(prefix):
299
- labels[index] = label[len(prefix):]
300
-
301
  labels = list(set(labels))
302
  label = prefix
303
-
304
  if len(labels) == 1:
305
  label += labels[0]
306
  elif len(labels) == 2:
@@ -371,12 +371,14 @@ def split_into_paragraphs(input_text: str) -> list[str]:
371
  return out_paragraphs
372
 
373
 
374
- def extract_starts_ends(colored_idx: list[dict]) -> tuple[list[int], list[int]]:
 
 
375
  """
376
  Extracts start and end indices from a list of dictionaries.
377
 
378
  Args:
379
- colored_idx (list[dict]): A list of dictionaries,
380
  where each dictionary has 'start' and 'end' keys.
381
 
382
  Returns:
@@ -392,19 +394,23 @@ def extract_starts_ends(colored_idx: list[dict]) -> tuple[list[int], list[int]]:
392
  return starts, ends
393
 
394
 
395
- def filter_indices(starts: list[int], ends: list[int], ignore_indices: list[int]):
 
 
 
 
396
  """
397
  Filters start and end indices to exclude any indices present in the
398
  ignore_indices list.
399
 
400
  Args:
401
  starts (list[int]): A list of starting indices.
402
- ends (list[int]): A list of ending indices.
403
  Must be the same length as starts.
404
  ignore_indices (list[int]): A list of indices to exclude.
405
 
406
  Returns:
407
- A tuple of two lists of integers:
408
  - filtered_starts
409
  - filtered_ends
410
  Returns empty lists if the input is invalid
@@ -454,9 +460,13 @@ def filter_indices(starts: list[int], ends: list[int], ignore_indices: list[int]
454
  return filtered_starts, filtered_ends
455
 
456
 
457
- def extract_new_startend(start: int, end: int, ignore_indices: list[int]) -> tuple[list[int], list[int]]:
 
 
 
 
458
  """
459
- Extracts new start and end indices by splitting a range based on
460
  ignored indices.
461
 
462
  Args:
@@ -476,7 +486,7 @@ def extract_new_startend(start: int, end: int, ignore_indices: list[int]) -> tup
476
  new_starts = []
477
  new_ends = []
478
  new_start = start
479
-
480
  # If no indices to ignore, return the original range.
481
  if indexes is None or len(indexes) < 1:
482
  new_starts.append(start)
@@ -489,7 +499,7 @@ def extract_new_startend(start: int, end: int, ignore_indices: list[int]) -> tup
489
  continue
490
  elif index >= end:
491
  continue
492
-
493
  new_starts.append(new_start)
494
  new_ends.append(index)
495
 
@@ -498,4 +508,4 @@ def extract_new_startend(start: int, end: int, ignore_indices: list[int]) -> tup
498
  new_starts.append(new_start)
499
  new_ends.append(end)
500
 
501
- return new_starts, new_ends
 
296
  prefix = "Partially generated by "
297
  for index, label in enumerate(labels):
298
  if label.startswith(prefix):
299
+ labels[index] = label[len(prefix) :]
300
+
301
  labels = list(set(labels))
302
  label = prefix
303
+
304
  if len(labels) == 1:
305
  label += labels[0]
306
  elif len(labels) == 2:
 
371
  return out_paragraphs
372
 
373
 
374
+ def extract_starts_ends(
375
+ colored_idx: list[dict],
376
+ ) -> tuple[list[int], list[int]]:
377
  """
378
  Extracts start and end indices from a list of dictionaries.
379
 
380
  Args:
381
+ colored_idx (list[dict]): A list of dictionaries,
382
  where each dictionary has 'start' and 'end' keys.
383
 
384
  Returns:
 
394
  return starts, ends
395
 
396
 
397
+ def filter_indices(
398
+ starts: list[int],
399
+ ends: list[int],
400
+ ignore_indices: list[int],
401
+ ):
402
  """
403
  Filters start and end indices to exclude any indices present in the
404
  ignore_indices list.
405
 
406
  Args:
407
  starts (list[int]): A list of starting indices.
408
+ ends (list[int]): A list of ending indices.
409
  Must be the same length as starts.
410
  ignore_indices (list[int]): A list of indices to exclude.
411
 
412
  Returns:
413
+ A tuple of two lists of integers:
414
  - filtered_starts
415
  - filtered_ends
416
  Returns empty lists if the input is invalid
 
460
  return filtered_starts, filtered_ends
461
 
462
 
463
+ def extract_new_startend(
464
+ start: int,
465
+ end: int,
466
+ ignore_indices: list[int],
467
+ ) -> tuple[list[int], list[int]]:
468
  """
469
+ Extracts new start and end indices by splitting a range based on
470
  ignored indices.
471
 
472
  Args:
 
486
  new_starts = []
487
  new_ends = []
488
  new_start = start
489
+
490
  # If no indices to ignore, return the original range.
491
  if indexes is None or len(indexes) < 1:
492
  new_starts.append(start)
 
499
  continue
500
  elif index >= end:
501
  continue
502
+
503
  new_starts.append(new_start)
504
  new_ends.append(index)
505
 
 
508
  new_starts.append(new_start)
509
  new_ends.append(end)
510
 
511
+ return new_starts, new_ends
src/application/text/search_detection.py CHANGED
@@ -3,6 +3,7 @@ Author: Khanh Phan
3
  Date: 2024-12-04
4
  """
5
 
 
6
  import warnings
7
 
8
  import numpy as np
@@ -229,7 +230,7 @@ def check_paraphrase(input_text: str, source_text: str, url: str) -> dict:
229
  return alignment
230
 
231
 
232
- def determine_label(similarity: float) -> tuple[str | None, bool]:
233
  """
234
  Determines a label and paraphrase status based on the similarity score.
235
 
 
3
  Date: 2024-12-04
4
  """
5
 
6
+ from typing import Optional
7
  import warnings
8
 
9
  import numpy as np
 
230
  return alignment
231
 
232
 
233
+ def determine_label(similarity: float) -> tuple[Optional[str], bool]:
234
  """
235
  Determines a label and paraphrase status based on the similarity score.
236
 
src/application/text/text.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
class TextDetector:
    """
    Holds the results of the text analysis together with the row data
    collected for each report table.
    """

    def __init__(self):
        # Prediction label(s) and score(s); index 0 is what the report
        # rows display.
        self.prediction_label: list[str] = ["UNKNOWN"]
        self.prediction_score: list[float] = [0.0]

        # Starts as an empty frame without columns.
        self.grouped_url_df: pd.DataFrame = pd.DataFrame()

        # For formatting output tables: one independent list per audience.
        self.ordinary_user_table: list = []
        self.fact_checker_table: list = []
        self.governor_table: list = []