pmkhanh7890 commited on
Commit
62dc9d8
·
1 Parent(s): 66396a8

add AI tools; replace text baseline model; add Quoc's new algorithm

Browse files
application.py CHANGED
@@ -135,17 +135,27 @@ between the input text and the source.
135
  <thead>
136
  <tr>
137
  <th>Input news</th>
138
- <th>Source (corresponding URL provided in Originality)</th>
139
  <th>Forensic</th>
140
  <th>Originality</th>
141
  </tr>
142
  </thead>
143
  <tbody>
144
  <tr>
145
- <th>TBD</th>
146
- <th>TBD</th>
147
- <th>TBD</th>
148
- <th>TBD</th>
 
 
 
 
 
 
 
 
 
 
149
  </tr>
150
  </tbody>
151
  </table>
 
135
  <thead>
136
  <tr>
137
  <th>Input news</th>
138
+ <th>Source (URL in Originality)</th>
139
  <th>Forensic</th>
140
  <th>Originality</th>
141
  </tr>
142
  </thead>
143
  <tbody>
144
  <tr>
145
+ <td style="border-bottom: 1px solid transparent";>TBD</td>
146
+ <td style="border-bottom: 1px solid transparent";>TBD</td>
147
+ <td rowspan="2">TBD</td>
148
+ <td rowspan="2">TBD</td>
149
+ </tr>
150
+ <tr>
151
+ <td style="border-top: 1px solid transparent";>TBD</td>
152
+ <td style="border-top: 1px solid transparent";>TBD</td>
153
+ </tr>
154
+ <tr>
155
+ <td>TBD</td>
156
+ <td>TBD</td>
157
+ <td>TBD</td>
158
+ <td>TBD</td>
159
  </tr>
160
  </tbody>
161
  </table>
application_2.py CHANGED
@@ -1,254 +0,0 @@
1
- import gradio as gr
2
- import requests
3
- from PIL import Image
4
-
5
- from src.application.content_detection import NewsVerification
6
- from src.application.content_generation import (
7
- generate_fake_image,
8
- generate_fake_text,
9
- replace_text,
10
- )
11
- from src.application.url_reader import URLReader
12
-
13
- AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
14
- AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
15
-
16
-
17
- def load_url(url):
18
- """
19
- Load content from the given URL.
20
- """
21
- content = URLReader(url)
22
- image = None
23
- header = {
24
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
25
- }
26
- try:
27
- response = requests.get(
28
- url,
29
- headers=header,
30
- stream=True,
31
- )
32
- response.raise_for_status() # Raise an exception for bad status codes
33
-
34
- image_response = requests.get(content.top_image, stream=True)
35
- try:
36
- image = Image.open(image_response.raw)
37
- except OSError as e:
38
- print(f"Error loading image from {content.top_image}: {e}")
39
-
40
- except (requests.exceptions.RequestException, FileNotFoundError) as e:
41
- print(f"Error fetching image: {e}")
42
-
43
- return content.title, content.text, image
44
-
45
-
46
- def generate_analysis_report(
47
- news_title: str,
48
- news_content: str,
49
- news_image: Image,
50
- ):
51
- news_analysis = NewsVerification()
52
- news_analysis.load_news(news_title, news_content, news_image)
53
- news_analysis.generate_analysis_report()
54
- return news_analysis.analyze_details()
55
-
56
-
57
- # Define the GUI
58
- with gr.Blocks() as demo:
59
- gr.Markdown("# NEWS VERIFICATION")
60
-
61
- with gr.Row():
62
- # SETTINGS
63
- with gr.Column(scale=1):
64
- with gr.Accordion("1. Enter a URL"):
65
- url_input = gr.Textbox(
66
- label="",
67
- show_label=False,
68
- value="",
69
- )
70
- load_button = gr.Button("Load URL")
71
-
72
- with gr.Accordion(
73
- "2. Select content-generation models",
74
- open=True,
75
- visible=False,
76
- ):
77
- with gr.Row():
78
- text_generation_model = gr.Dropdown(
79
- choices=AZURE_TEXT_MODEL,
80
- label="Text-generation model",
81
- )
82
- image_generation_model = gr.Dropdown(
83
- choices=AZURE_IMAGE_MODEL,
84
- label="Image-generation model",
85
- )
86
- generate_text_button = gr.Button("Generate text")
87
- generate_image_button = gr.Button("Generate image")
88
-
89
- with gr.Accordion(
90
- "3. Replace any terms",
91
- open=True,
92
- visible=False,
93
- ):
94
- replace_df = gr.Dataframe(
95
- headers=["Find what:", "Replace with:"],
96
- datatype=["str", "str"],
97
- row_count=(1, "dynamic"),
98
- col_count=(2, "fixed"),
99
- interactive=True,
100
- )
101
- replace_button = gr.Button("Replace all")
102
-
103
- # GENERATED CONTENT
104
- with gr.Accordion("Input News"):
105
- news_title = gr.Textbox(label="Title", value="")
106
- news_image = gr.Image(label="Image", type="filepath")
107
- news_content = gr.Textbox(label="Content", value="", lines=13)
108
-
109
- # NEWS ANALYSIS REPORT
110
- ordinary_user_explanation = """
111
- FOR ORDINARY USER<br>
112
- - Green texts are the matched words in the input and source news.<br>
113
- - Each highlighted pair (marked with a number) shows the key differences
114
- between the input text and the source.
115
- """
116
- fact_checker_explanation = """
117
- FOR FACT CHECKER<br>
118
- - Green texts are the matched words in the input and source news.<br>
119
- - Each highlighted pair (marked with a number) shows the key differences
120
- between the input text and the source.
121
- """
122
- governor_explanation = """
123
- FOR GOVERNOR<br>
124
- - Green texts are the matched words in the input and source news.<br>
125
- - Each highlighted pair (marked with a number) shows the key differences
126
- between the input text and the source.
127
- """
128
- table = """
129
- <h5>Comparison between input news and source news:</h5>
130
- <table border="1" style="width:100%; text-align:left;">
131
- <col style="width: 170px;">
132
- <col style="width: 170px;">
133
- <col style="width: 30px;">
134
- <col style="width: 75px;">
135
- <thead>
136
- <tr>
137
- <th>Input news</th>
138
- <th>Source (corresponding URL provided in Originality)</th>
139
- <th>Forensic</th>
140
- <th>Originality</th>
141
- </tr>
142
- </thead>
143
- <tbody>
144
- <tr>
145
- <th>TBD</th>
146
- <th>TBD</th>
147
- <th>TBD</th>
148
- <th>TBD</th>
149
- </tr>
150
- </tbody>
151
- </table>
152
-
153
- <style>"""
154
- with gr.Column(scale=2):
155
- with gr.Accordion("NEWS ANALYSIS"):
156
- verification_button = gr.Button("Verify news")
157
- with gr.Tab("Orinary User"):
158
- gr.HTML(ordinary_user_explanation)
159
- ordinary_user_result = gr.HTML(table)
160
- with gr.Tab("Fact Checker"):
161
- gr.HTML(fact_checker_explanation)
162
- fact_checker_result = gr.HTML(table)
163
- with gr.Tab("Governor"):
164
- gr.HTML(governor_explanation)
165
- governor_result = gr.HTML(table)
166
-
167
- # Connect events
168
- load_button.click(
169
- load_url,
170
- inputs=url_input,
171
- outputs=[news_title, news_content, news_image],
172
- )
173
- replace_button.click(
174
- replace_text,
175
- inputs=[news_title, news_content, replace_df],
176
- outputs=[news_title, news_content],
177
- )
178
- generate_text_button.click(
179
- generate_fake_text,
180
- inputs=[text_generation_model, news_title, news_content],
181
- outputs=[news_title, news_content],
182
- )
183
- generate_image_button.click(
184
- generate_fake_image,
185
- inputs=[image_generation_model, news_title],
186
- outputs=[news_image],
187
- )
188
- verification_button.click(
189
- generate_analysis_report,
190
- inputs=[news_title, news_content, news_image],
191
- outputs=[ordinary_user_result, fact_checker_result, governor_result],
192
- )
193
-
194
- # change Image
195
- # url_input.change(load_image, inputs=url_input, outputs=image_view)
196
-
197
- try:
198
- with open(
199
- "examples/example_text_real.txt",
200
- encoding="utf-8",
201
- ) as file:
202
- text_real_1 = file.read()
203
- with open(
204
- "examples/example_text_real_2.txt",
205
- encoding="utf-8",
206
- ) as file:
207
- text_real_2 = file.read()
208
- with open(
209
- "examples/example_text_LLM_topic.txt",
210
- encoding="utf-8",
211
- ) as file:
212
- text_llm_topic = file.read()
213
- with open(
214
- "examples/example_text_LLM_modification.txt",
215
- encoding="utf-8",
216
- ) as file:
217
- text_llm_modification = file.read()
218
- with open(
219
- "examples/example_text_LLM_entities.txt",
220
- encoding="utf-8",
221
- ) as file:
222
- text_llm_entities = file.read()
223
- except FileNotFoundError:
224
- print("File not found.")
225
- except Exception as e:
226
- print(f"An error occurred: {e}")
227
-
228
- title_1 = "Southampton news: Leeds target striker Cameron Archer."
229
- title_2 = "Southampton news: Leeds target striker Cameron Archer."
230
- title_4 = "Japan pledges support for Ukraine with 100-year pact."
231
-
232
- image_1 = "examples/example_image_real_1.jpg.webp"
233
- image_2 = "examples/example_image_real_2.jpg.webp"
234
- image_3 = "examples/example_image_real_3.jpg"
235
- image_4 = "examples/example_image_real_4.jpg.webp"
236
-
237
- gr.Examples(
238
- examples=[
239
- [title_1, image_1, text_real_1 + "\n\n" + text_real_2],
240
- [title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
241
- [title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
242
- [title_4, image_4, text_llm_entities],
243
- ],
244
- inputs=[news_title, news_image, news_content],
245
- label="Examples",
246
- example_labels=[
247
- "2 real news",
248
- "1 real news + 1 LLM modification-based news",
249
- "1 real news + 1 LLM topic-based news",
250
- "1 LLM changed-entities news",
251
- ],
252
- )
253
-
254
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
application_3.py DELETED
@@ -1,254 +0,0 @@
1
- import gradio as gr
2
- import requests
3
- from PIL import Image
4
-
5
- from src.application.content_detection import NewsVerification
6
- from src.application.content_generation import (
7
- generate_fake_image,
8
- generate_fake_text,
9
- replace_text,
10
- )
11
- from src.application.url_reader import URLReader
12
-
13
- AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
14
- AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
15
-
16
-
17
- def load_url(url):
18
- """
19
- Load content from the given URL.
20
- """
21
- content = URLReader(url)
22
- image = None
23
- header = {
24
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
25
- }
26
- try:
27
- response = requests.get(
28
- url,
29
- headers=header,
30
- stream=True,
31
- )
32
- response.raise_for_status() # Raise an exception for bad status codes
33
-
34
- image_response = requests.get(content.top_image, stream=True)
35
- try:
36
- image = Image.open(image_response.raw)
37
- except OSError as e:
38
- print(f"Error loading image from {content.top_image}: {e}")
39
-
40
- except (requests.exceptions.RequestException, FileNotFoundError) as e:
41
- print(f"Error fetching image: {e}")
42
-
43
- return content.title, content.text, image
44
-
45
-
46
- def generate_analysis_report(
47
- news_title: str,
48
- news_content: str,
49
- news_image: Image,
50
- ):
51
- news_analysis = NewsVerification()
52
- news_analysis.load_news(news_title, news_content, news_image)
53
- news_analysis.generate_analysis_report()
54
- return news_analysis.analyze_details()
55
-
56
-
57
- # Define the GUI
58
- with gr.Blocks() as demo:
59
- gr.Markdown("# NEWS VERIFICATION")
60
-
61
- with gr.Row():
62
- # SETTINGS
63
- with gr.Column(scale=1):
64
- with gr.Accordion("1. Enter a URL"):
65
- url_input = gr.Textbox(
66
- label="",
67
- show_label=False,
68
- value="",
69
- )
70
- load_button = gr.Button("Load URL")
71
-
72
- with gr.Accordion(
73
- "2. Select content-generation models",
74
- open=True,
75
- visible=False,
76
- ):
77
- with gr.Row():
78
- text_generation_model = gr.Dropdown(
79
- choices=AZURE_TEXT_MODEL,
80
- label="Text-generation model",
81
- )
82
- image_generation_model = gr.Dropdown(
83
- choices=AZURE_IMAGE_MODEL,
84
- label="Image-generation model",
85
- )
86
- generate_text_button = gr.Button("Generate text")
87
- generate_image_button = gr.Button("Generate image")
88
-
89
- with gr.Accordion(
90
- "3. Replace any terms",
91
- open=True,
92
- visible=False,
93
- ):
94
- replace_df = gr.Dataframe(
95
- headers=["Find what:", "Replace with:"],
96
- datatype=["str", "str"],
97
- row_count=(1, "dynamic"),
98
- col_count=(2, "fixed"),
99
- interactive=True,
100
- )
101
- replace_button = gr.Button("Replace all")
102
-
103
- # GENERATED CONTENT
104
- with gr.Accordion("Input News"):
105
- news_title = gr.Textbox(label="Title", value="")
106
- news_image = gr.Image(label="Image", type="filepath")
107
- news_content = gr.Textbox(label="Content", value="", lines=13)
108
-
109
- # NEWS ANALYSIS REPORT
110
- ordinary_user_explanation = """
111
- FOR ORDINARY USER<br>
112
- - Green texts are the matched words in the input and source news.<br>
113
- - Each highlighted pair (marked with a number) shows the key differences
114
- between the input text and the source.
115
- """
116
- fact_checker_explanation = """
117
- FOR FACT CHECKER<br>
118
- - Green texts are the matched words in the input and source news.<br>
119
- - Each highlighted pair (marked with a number) shows the key differences
120
- between the input text and the source.
121
- """
122
- governor_explanation = """
123
- FOR GOVERNOR<br>
124
- - Green texts are the matched words in the input and source news.<br>
125
- - Each highlighted pair (marked with a number) shows the key differences
126
- between the input text and the source.
127
- """
128
- table = """
129
- <h5>Comparison between input news and source news:</h5>
130
- <table border="1" style="width:100%; text-align:left;">
131
- <col style="width: 170px;">
132
- <col style="width: 170px;">
133
- <col style="width: 30px;">
134
- <col style="width: 75px;">
135
- <thead>
136
- <tr>
137
- <th>Input news</th>
138
- <th>Source (corresponding URL provided in Originality)</th>
139
- <th>Forensic</th>
140
- <th>Originality</th>
141
- </tr>
142
- </thead>
143
- <tbody>
144
- <tr>
145
- <th>TBD</th>
146
- <th>TBD</th>
147
- <th>TBD</th>
148
- <th>TBD</th>
149
- </tr>
150
- </tbody>
151
- </table>
152
-
153
- <style>"""
154
- with gr.Column(scale=2):
155
- with gr.Accordion("NEWS ANALYSIS"):
156
- verification_button = gr.Button("Verify news")
157
- with gr.Tab("Orinary User"):
158
- gr.HTML(ordinary_user_explanation)
159
- ordinary_user_result = gr.HTML(table)
160
- with gr.Tab("Fact Checker"):
161
- gr.HTML(fact_checker_explanation)
162
- fact_checker_result = gr.HTML(table)
163
- with gr.Tab("Governor"):
164
- gr.HTML(governor_explanation)
165
- governor_result = gr.HTML(table)
166
-
167
- # Connect events
168
- load_button.click(
169
- load_url,
170
- inputs=url_input,
171
- outputs=[news_title, news_content, news_image],
172
- )
173
- replace_button.click(
174
- replace_text,
175
- inputs=[news_title, news_content, replace_df],
176
- outputs=[news_title, news_content],
177
- )
178
- generate_text_button.click(
179
- generate_fake_text,
180
- inputs=[text_generation_model, news_title, news_content],
181
- outputs=[news_title, news_content],
182
- )
183
- generate_image_button.click(
184
- generate_fake_image,
185
- inputs=[image_generation_model, news_title],
186
- outputs=[news_image],
187
- )
188
- verification_button.click(
189
- generate_analysis_report,
190
- inputs=[news_title, news_content, news_image],
191
- outputs=[ordinary_user_result, fact_checker_result, governor_result],
192
- )
193
-
194
- # change Image
195
- # url_input.change(load_image, inputs=url_input, outputs=image_view)
196
-
197
- try:
198
- with open(
199
- "examples/example_text_real.txt",
200
- encoding="utf-8",
201
- ) as file:
202
- text_real_1 = file.read()
203
- with open(
204
- "examples/example_text_real_2.txt",
205
- encoding="utf-8",
206
- ) as file:
207
- text_real_2 = file.read()
208
- with open(
209
- "examples/example_text_LLM_topic.txt",
210
- encoding="utf-8",
211
- ) as file:
212
- text_llm_topic = file.read()
213
- with open(
214
- "examples/example_text_LLM_modification.txt",
215
- encoding="utf-8",
216
- ) as file:
217
- text_llm_modification = file.read()
218
- with open(
219
- "examples/example_text_LLM_entities.txt",
220
- encoding="utf-8",
221
- ) as file:
222
- text_llm_entities = file.read()
223
- except FileNotFoundError:
224
- print("File not found.")
225
- except Exception as e:
226
- print(f"An error occurred: {e}")
227
-
228
- title_1 = "Southampton news: Leeds target striker Cameron Archer."
229
- title_2 = "Southampton news: Leeds target striker Cameron Archer."
230
- title_4 = "Japan pledges support for Ukraine with 100-year pact."
231
-
232
- image_1 = "examples/example_image_real_1.jpg.webp"
233
- image_2 = "examples/example_image_real_2.jpg.webp"
234
- image_3 = "examples/example_image_real_3.jpg"
235
- image_4 = "examples/example_image_real_4.jpg.webp"
236
-
237
- gr.Examples(
238
- examples=[
239
- [title_1, image_1, text_real_1 + "\n\n" + text_real_2],
240
- [title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
241
- [title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
242
- [title_4, image_4, text_llm_entities],
243
- ],
244
- inputs=[news_title, news_image, news_content],
245
- label="Examples",
246
- example_labels=[
247
- "2 real news",
248
- "1 real news + 1 LLM modification-based news",
249
- "1 real news + 1 LLM topic-based news",
250
- "1 LLM changed-entities news",
251
- ],
252
- )
253
-
254
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gpt_test.py CHANGED
@@ -76,12 +76,12 @@ azure_client = AzureOpenAI(
76
  api_version="2024-05-01-preview",
77
  )
78
 
79
- deplopment_name = "gpt-4o" # "o1-mini" # or "gpt-4o"
80
  TEXT_PROMPT = """
81
  Paraphrase the following news, only output the paraphrased text:
82
 
83
  """
84
- text = get_first_column("data/bbc_news.csv")
85
  count = 0
86
  for index, news in enumerate(text):
87
  if count > 1000:
@@ -107,4 +107,4 @@ for index, news in enumerate(text):
107
  count += 1
108
  paraphrased_news = response.choices[0].message.content
109
 
110
- add_text_to_csv("data/bbc_news_4o.csv", paraphrased_news, count)
 
76
  api_version="2024-05-01-preview",
77
  )
78
 
79
+ deplopment_name = "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
80
  TEXT_PROMPT = """
81
  Paraphrase the following news, only output the paraphrased text:
82
 
83
  """
84
+ text = get_first_column("data/MAGE.csv")
85
  count = 0
86
  for index, news in enumerate(text):
87
  if count > 1000:
 
107
  count += 1
108
  paraphrased_news = response.choices[0].message.content
109
 
110
+ add_text_to_csv("data/MAGE_4o_mini.csv", paraphrased_news, count)
src/application/content_detection.py CHANGED
@@ -13,11 +13,10 @@ from src.application.text.entity import (
13
  highlight_entities,
14
  )
15
  from src.application.text.helper import extract_equal_text
16
- from src.application.text.model_detection import detect_text_by_ai_model
17
  from src.application.text.preprocessing import split_into_paragraphs
18
  from src.application.text.search_detection import (
19
- check_human,
20
- detect_text_by_relative_search,
21
  find_paragraph_source,
22
  )
23
 
@@ -29,18 +28,21 @@ class NewsVerification:
29
  self.news_content = ""
30
  self.news_image = ""
31
 
32
- self.text_prediction_label: list[str] = []
33
- self.text_prediction_score: list[float] = []
34
- self.text_referent_url: list[str] = []
35
- self.image_prediction_label: list[str] = []
36
- self.image_prediction_score: list[str] = []
37
  self.image_referent_url: list[str] = []
 
38
  self.news_prediction_label = ""
39
  self.news_prediction_score = -1
40
 
 
41
  self.found_img_url: list[str] = []
42
- self.aligned_sentences: list[dict] = []
43
- self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
 
44
  columns=[
45
  "input",
46
  "source",
@@ -52,32 +54,78 @@ class NewsVerification:
52
  "entities",
53
  ],
54
  )
55
- self.is_paraphrased: list[bool] = []
56
 
 
57
  self.ordinary_user_table: list = []
58
  self.fact_checker_table: list = []
59
  self.governor_table: list = []
60
- self.entities_with_colors = []
61
 
62
  def load_news(self, news_title, news_content, news_image):
63
- self.news_text = news_title + "\n\n" + news_content
64
  self.news_title = news_title
65
  self.news_content = news_content
66
  self.news_image = news_image
67
 
68
  def determine_text_origin(self):
69
  self.find_text_source()
70
- label, score = self.verify_text()
71
- if label == "UNKNOWN":
72
- # Concatenate text from "input" in sentence_df
73
- print(self.aligned_sentences_df["input"])
74
- text = " ".join(self.aligned_sentences_df["input"].tolist())
75
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  # detect by baseline model
77
- label, score = detect_text_by_ai_model(text)
78
-
79
- return label, score
80
-
81
  def find_text_source(self):
82
  """
83
  Determines the origin of the given text based on paraphrasing detection
@@ -99,8 +147,8 @@ class NewsVerification:
99
  # Setup df for input_sentences
100
 
101
  for _ in range(len(input_sentences)):
102
- self.aligned_sentences_df = pd.concat(
103
- [self.aligned_sentences_df, pd.DataFrame([{
104
  "input": None,
105
  "source": None,
106
  "label": None,
@@ -113,36 +161,63 @@ class NewsVerification:
113
  )
114
 
115
  # find a source for each paragraph
116
- for index, sentence in enumerate(input_sentences):
117
- if self.aligned_sentences_df.loc[index, "url"] is not None:
118
- continue
 
 
119
 
120
- print(f"-------index = {index}-------")
121
- print(f"current_sentence = {input_sentences[index]}")
122
 
123
- self.aligned_sentences_df, img_urls = find_paragraph_source(
124
  input_sentences,
125
  index,
126
- self.aligned_sentences_df,
127
  )
128
 
129
  self.found_img_url.extend(img_urls)
130
 
131
  # determine if the whole source is from a news or not
132
 
133
- def verify_text(self):
 
 
134
  # calculate the average similarity when the similary score in each row of sentences_df is higher than 0.8
135
- filtered_by_similarity = self.aligned_sentences_df[
136
- self.aligned_sentences_df["similarity"] > 0.8
137
  ]
138
- if len(filtered_by_similarity) / len(self.aligned_sentences_df) > 2:
139
- avg_similarity = filtered_by_similarity.similarity.mean()
140
- if avg_similarity > 0.963:
141
- return "HUMAN", avg_similarity
142
- if avg_similarity > 0.8:
143
- return "MACHINE", avg_similarity
144
-
145
- return "UNKNOWN", 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  def determine_image_origin(self):
148
  print("CHECK IMAGE:")
@@ -152,14 +227,12 @@ class NewsVerification:
152
  self.image_referent_url = None
153
  return
154
 
155
- for image in self.found_img_url:
156
- print(f"\tfound_img_url: {image}")
157
  matched_url, similarity = detect_image_from_news_image(
158
  self.news_image,
159
  self.found_img_url,
160
  )
161
  if matched_url is not None:
162
- print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
163
  self.image_prediction_label = "HUMAN"
164
  self.image_prediction_score = similarity
165
  self.image_referent_url = matched_url
@@ -169,7 +242,7 @@ class NewsVerification:
169
  self.news_image,
170
  )
171
  if matched_url is not None:
172
- print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
173
  self.image_prediction_label = "HUMAN"
174
  self.image_prediction_score = similarity
175
  self.image_referent_url = matched_url
@@ -187,50 +260,35 @@ class NewsVerification:
187
  self.image_prediction_score = 50
188
  self.image_referent_url = None
189
 
190
- def determine_news_origin(self):
191
- if self.text_prediction_label == "MACHINE":
192
- text_prediction_score = 100 - self.text_prediction_score
193
- elif self.text_prediction_label == "UNKNOWN":
194
- text_prediction_score = 50
195
- else:
196
- text_prediction_score = self.text_prediction_score
197
-
198
- if self.image_prediction_label == "MACHINE":
199
- image_prediction_score = 100 - self.image_prediction_score
200
- elif self.image_prediction_label == "UNKNOWN":
201
- image_prediction_score = 50
202
- else:
203
- image_prediction_score = self.image_prediction_score
204
-
205
- news_prediction_score = (
206
- text_prediction_score + image_prediction_score
207
- ) / 2
208
- if news_prediction_score > 50:
209
- self.news_prediction_score = news_prediction_score
210
- self.news_prediction_label = "HUMAN"
211
- else:
212
- self.news_prediction_score = 100 - news_prediction_score
213
- self.news_prediction_label = "MACHINE"
214
-
215
  def generate_analysis_report(self):
216
- self.determine_text_origin()
217
- self.determine_image_origin()
 
 
218
 
219
  def analyze_details(self):
 
 
 
 
 
 
 
 
220
  entities_with_colors = []
221
- for index, aligned_sentence in enumerate(self.aligned_sentences):
222
  # Get entity-words (in pair) with colors
223
  entities_with_colors = highlight_entities(
224
- aligned_sentence["input"],
225
- aligned_sentence["source"],
226
  )
227
- self.aligned_sentences[index]["entities"] = entities_with_colors
228
 
229
- ordinary_user_table = self.create_ordinary_user_table()
230
- fact_checker_table = self.create_fact_checker_table()
231
- governor_table = self.create_governor_table()
 
 
232
 
233
- return ordinary_user_table, fact_checker_table, governor_table
234
 
235
  def get_text_urls(self):
236
  return set(self.text_referent_url)
@@ -277,33 +335,52 @@ class NewsVerification:
277
  max_length = 30 # TODO: put this in configuration
278
  rows.append(self.format_image_fact_checker_row(max_length))
279
 
280
- for aligned_sentence in self.aligned_sentences:
281
- if "input" not in aligned_sentence:
282
  continue
283
-
284
- # Get index of equal phrases in input and source sentences
285
- equal_idx_1, equal_idx_2 = extract_equal_text(
286
- aligned_sentence["input"],
287
- aligned_sentence["source"],
288
- )
289
-
290
- # Get entity-words (in pair) with colors
291
- # entities_with_colors = highlight_entities(
292
- # aligned_sentence["input"],
293
- # aligned_sentence["source"],
294
- # )
295
 
296
  self.fact_checker_table.append(
297
  [
298
- aligned_sentence,
299
  equal_idx_1,
300
  equal_idx_2,
301
- aligned_sentence["entities"],
 
302
  ],
303
  )
304
-
305
- for row in self.fact_checker_table:
306
- formatted_row = self.format_text_fact_checker_row(row, max_length)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  rows.append(formatted_row)
308
 
309
  table = "\n".join(rows)
@@ -317,7 +394,7 @@ class NewsVerification:
317
  <thead>
318
  <tr>
319
  <th>Input news</th>
320
- <th>Source (corresponding URL provided in Originality)</th>
321
  <th>Forensic</th>
322
  <th>Originality</th>
323
  </tr>
@@ -330,23 +407,38 @@ class NewsVerification:
330
  <style>
331
  """
332
 
333
- def format_text_fact_checker_row(self, row, max_length=30):
 
 
 
 
 
 
 
334
  entity_count = 0
335
- if row[0]["input"] == "":
336
  return ""
337
- if row[0]["source"] != "": # source is not empty
338
- # highlight entities
339
- input_sentence, highlight_idx_input = apply_highlight(
340
- row[0]["input"],
341
- row[3],
342
- "input",
343
- )
344
- source_sentence, highlight_idx_source = apply_highlight(
345
- row[0]["source"],
346
- row[3],
347
- "source",
348
- )
349
- entity_count = len(row[3])
 
 
 
 
 
 
 
 
350
 
351
  # Color overlapping words
352
  input_sentence = self.color_text(
@@ -360,6 +452,8 @@ class NewsVerification:
360
  highlight_idx_source,
361
  ) # text, index of highlight words
362
 
 
 
363
  input_sentence = input_sentence.replace(
364
  "span_style",
365
  "span style",
@@ -372,23 +466,69 @@ class NewsVerification:
372
  input_sentence = row[0]["input"]
373
  source_sentence = row[0]["source"]
374
 
375
- label = row[0]["label"]
376
- score = row[0]["similarity"]
377
-
378
- url = row[0]["url"] #
 
 
 
 
 
 
 
 
 
 
379
  short_url = self.shorten_url(url, max_length)
380
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
381
 
 
382
  entity_count_text = self.get_entity_count_text(entity_count)
383
-
384
- return f"""
385
- <tr>
386
- <td>{input_sentence}</td>
387
- <td>{source_sentence}</td>
388
- <td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td> # noqa: E501
389
- <td>{source_text_url}</td>
390
- </tr>
391
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
 
393
  def format_image_fact_checker_row(self, max_length=30):
394
 
@@ -396,7 +536,7 @@ class NewsVerification:
396
  self.image_referent_url is not None
397
  or self.image_referent_url != ""
398
  ):
399
- source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">""" # noqa: E501
400
  short_url = self.shorten_url(self.image_referent_url, max_length)
401
  source_image_url = (
402
  f"""<a href="{self.image_referent_url}">{short_url}</a>"""
@@ -418,7 +558,6 @@ class NewsVerification:
418
  <h5>Comparison between input news and source news:</h5>
419
  <table border="1" style="width:100%; text-align:left;">
420
  <col style="width: 170px;">
421
- <col style="width: 170px;">
422
  <col style="width: 30px;">
423
  <col style="width: 75px;">
424
  <thead>
@@ -439,26 +578,22 @@ class NewsVerification:
439
  def format_text_ordinary_user_row(self, max_length=30):
440
  input_sentences = ""
441
  source_text_urls = ""
442
- label = ""
443
- scores = 0
444
- sentence_count = 0
445
- for index, row in enumerate(self.aligned_sentences):
446
- if row["input"] == "":
447
  continue
448
  input_sentences += row["input"] + "<br><br>"
449
- label = self.aligned_sentences[index]["label"]
450
-
451
- url = self.aligned_sentences[index]["url"] #
452
- short_url = self.shorten_url(url, max_length)
453
- source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
454
- sentence_count += 1
455
-
456
- scores, label = self.calculate_score_label()
457
 
458
  return f"""
459
  <tr>
460
  <td>{input_sentences}</td>
461
- <td>{label}<br>({scores * 100:.2f}%)</td>
 
462
  <td>{source_text_urls}</td>
463
  </tr>
464
  """
@@ -484,28 +619,26 @@ class NewsVerification:
484
  max_length = 30 # TODO: put this in configuration
485
  rows.append(self.format_image_governor_row(max_length))
486
 
487
- for aligned_sentence in self.aligned_sentences:
488
- if "input" not in aligned_sentence:
489
  continue
490
-
491
- # Get index of equal phrases in input and source sentences
492
- equal_idx_1, equal_idx_2 = extract_equal_text(
493
- aligned_sentence["input"],
494
- aligned_sentence["source"],
495
- )
496
-
497
- # Get entity-words (in pair) with colors
498
- # entities_with_colors = highlight_entities(
499
- # aligned_sentence["input"],
500
- # aligned_sentence["source"],
501
- # )
502
 
503
  self.governor_table.append(
504
  [
505
- aligned_sentence,
506
  equal_idx_1,
507
  equal_idx_2,
508
- aligned_sentence["entities"],
509
  ],
510
  )
511
 
@@ -523,7 +656,7 @@ class NewsVerification:
523
  <thead>
524
  <tr>
525
  <th>Input news</th>
526
- <th>Source (corresponding URL provided in Originality)</th>
527
  <th>Forensic</th>
528
  <th>Originality</th>
529
  </tr>
@@ -540,29 +673,27 @@ class NewsVerification:
540
  input_sentences = ""
541
  source_sentences = ""
542
  source_text_urls = ""
543
- label = ""
544
  sentence_count = 0
545
- entity_count = 0
546
  for row in self.governor_table:
547
- print(f"governor_row: {row}")
548
- if row[0]["input"] == "":
549
  continue
550
 
551
- if row[0]["source"] != "": # source is not empty
552
  # highlight entities
553
  input_sentence, highlight_idx_input = apply_highlight(
554
  row[0]["input"],
555
- row[3],
556
- "input",
557
- entity_count,
558
  )
559
  source_sentence, highlight_idx_source = apply_highlight(
560
  row[0]["source"],
561
- row[3],
562
- "source",
563
- entity_count,
564
  )
565
- entity_count += len(row[3])
566
 
567
  # Color overlapping words
568
  input_sentence = self.color_text(
@@ -586,26 +717,35 @@ class NewsVerification:
586
  ).replace("1px_4px", "1px 4px")
587
 
588
  else:
 
 
 
 
589
  input_sentence = row[0]["input"]
590
- source_sentence = row[0]["source"]
591
 
592
  # convert score to HUMAN-based score:
593
  input_sentences += input_sentence + "<br><br>"
594
  source_sentences += source_sentence + "<br><br>"
595
-
596
  url = row[0]["url"]
597
- short_url = self.shorten_url(url, max_length)
598
- source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
599
- sentence_count += 1
600
-
601
- score, label = self.calculate_score_label()
602
- entity_count_text = self.get_entity_count_text(entity_count)
 
 
 
603
 
604
  return f"""
605
  <tr>
606
  <td>{input_sentences}</td>
607
  <td>{source_sentences}</td>
608
- <td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td>
 
 
609
  <td>{source_text_urls}</td>
610
  </tr>
611
  """
@@ -615,7 +755,7 @@ class NewsVerification:
615
  self.image_referent_url is not None
616
  or self.image_referent_url != ""
617
  ):
618
- source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">""" # noqa: E501
619
  short_url = self.shorten_url(self.image_referent_url, max_length)
620
  source_image_url = (
621
  f"""<a href="{self.image_referent_url}">{short_url}</a>"""
@@ -630,7 +770,7 @@ class NewsVerification:
630
  if entity_count <= 0:
631
  entity_count_text = ""
632
  elif entity_count == 1:
633
- entity_count_text = "with altered entity"
634
  else:
635
  entity_count_text = "with altered entities"
636
  return entity_count_text
@@ -651,7 +791,7 @@ class NewsVerification:
651
 
652
  starts, ends = self.extract_starts_ends(colored_idx)
653
  starts, ends = self.filter_indices(starts, ends, highlighted_idx)
654
-
655
  previous_end = 0
656
  for start, end in zip(starts, ends):
657
  paragraph += " ".join(words[previous_end:start])
@@ -661,13 +801,7 @@ class NewsVerification:
661
 
662
  previous_end = end
663
 
664
- # Some left words due to the punctuation separated from
665
- # the highlighting text
666
- equal_words = " ".join(words[previous_end:])
667
- print(f"starts_2: {previous_end}")
668
- print(f"ends_2: {len(words) - 1}")
669
- print(f"equal_words: {words[previous_end:]}")
670
- paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
671
 
672
  return paragraph
673
 
@@ -750,38 +884,12 @@ class NewsVerification:
750
  end = number
751
  else:
752
  starts.append(start)
753
- ends.append(end + 1)
754
  start = number
755
  end = number
756
 
757
  if i == len(numbers) - 1:
758
  starts.append(start)
759
- ends.append(end + 1)
760
-
761
- return starts, ends
762
 
763
- def calculate_score_label(self):
764
- human_score = []
765
- machine_score = []
766
- machine_flag = False
767
- for sentence in self.aligned_sentences:
768
- if sentence["input"] == "":
769
- continue
770
- if sentence["label"] == "HUMAN":
771
- human_score.append(sentence["similarity"])
772
- elif sentence["label"] == "MACHINE":
773
- machine_score.append(1 - sentence["similarity"])
774
- machine_flag = True
775
-
776
- if machine_flag is True and len(machine_score) > 0:
777
- # average value of machine_score
778
- machine_score_avg = sum(machine_score) / len(machine_score)
779
- if machine_score_avg < 0.5:
780
- machine_score_avg = 1 - machine_score_avg
781
- return machine_score_avg, "MACHINE"
782
- elif machine_flag is False and len(human_score) > 0:
783
- # average value of human_score
784
- human_score_avg = sum(human_score) / len(human_score)
785
- return human_score_avg, "HUMAN"
786
- else:
787
- return 0, "UNKNOWN"
 
13
  highlight_entities,
14
  )
15
  from src.application.text.helper import extract_equal_text
16
+ from src.application.text.model_detection import detect_text_by_ai_model, predict_generation_model
17
  from src.application.text.preprocessing import split_into_paragraphs
18
  from src.application.text.search_detection import (
19
+ PARAPHRASE_THRESHOLD_MACHINE,
 
20
  find_paragraph_source,
21
  )
22
 
 
28
  self.news_content = ""
29
  self.news_image = ""
30
 
31
+ self.text_prediction_label: list[str] = ["UNKNOWN"]
32
+ self.text_prediction_score: list[float] = [0.0]
33
+
34
+ self.image_prediction_label: list[str] = ["UNKNOWN"]
35
+ self.image_prediction_score: list[str] = [0.0]
36
  self.image_referent_url: list[str] = []
37
+
38
  self.news_prediction_label = ""
39
  self.news_prediction_score = -1
40
 
41
+ # news' urls to find img
42
  self.found_img_url: list[str] = []
43
+
44
+ # Analyzed results
45
+ self.aligned_paragraphs_df: pd.DataFrame = pd.DataFrame(
46
  columns=[
47
  "input",
48
  "source",
 
54
  "entities",
55
  ],
56
  )
57
+ self.grouped_url_df: pd.DataFrame = pd.DataFrame()
58
 
59
+ # For formatting ouput tables
60
  self.ordinary_user_table: list = []
61
  self.fact_checker_table: list = []
62
  self.governor_table: list = []
 
63
 
64
  def load_news(self, news_title, news_content, news_image):
65
+ self.news_text = (news_title + "\n\n" + news_content).strip()
66
  self.news_title = news_title
67
  self.news_content = news_content
68
  self.news_image = news_image
69
 
70
  def determine_text_origin(self):
71
  self.find_text_source()
72
+
73
+ # Group inout and source by url
74
+ def concat_text(series):
75
+ return ' '.join(series.astype(str).tolist()) #Handle mixed data types and NaNs
76
+
77
+ self.grouped_url_df = self.aligned_paragraphs_df.groupby('url').agg(
78
+ {
79
+ 'input': concat_text,
80
+ 'source': concat_text,
81
+ }
82
+ )
83
+ self.grouped_url_df = self.grouped_url_df.reset_index()
84
+ # Add new columns for label and score
85
+ self.grouped_url_df["label"] = None
86
+ self.grouped_url_df["score"] = None
87
+
88
+ print(f"aligned_paragraphs_df:\n {self.aligned_paragraphs_df}")
89
+
90
+ for index, row in self.grouped_url_df.iterrows():
91
+ label, score = self.verify_text(row["url"])
92
+ if label == "UNKNOWN":
93
+ # Concatenate text from "input" in sentence_df
94
+ text = " ".join(row["input"])
95
+
96
+ # detect by baseline model
97
+ label, score = detect_text_by_ai_model(text)
98
+
99
+ self.grouped_url_df.at[index, "label"] = label
100
+ self.grouped_url_df.at[index, "score"] = score
101
+
102
+ # Overall label or score for the whole input text
103
+ if len(self.grouped_url_df) > 0:
104
+ # filter self.aligned_paragraphs_df["label"] if inclucind substring MACHINE
105
+ machine_label = self.grouped_url_df[
106
+ self.grouped_url_df["label"].str.contains("MACHINE", case=False, na=False)
107
+ ]
108
+ # machine_label = self.aligned_paragraphs_df[
109
+ # self.aligned_paragraphs_df["label"] == "MACHINE"
110
+ # ]
111
+ if len(machine_label) > 0:
112
+ label = " ".join(machine_label["label"].tolist())
113
+ self.text_prediction_label[0] = label
114
+ self.text_prediction_score[0] = machine_label["score"].mean()
115
+ else:
116
+ machine_label = self.aligned_paragraphs_df[
117
+ self.aligned_paragraphs_df["label"] == "HUMAN"
118
+ ]
119
+ self.text_prediction_label[0] = "HUMAN"
120
+ self.text_prediction_score[0] = machine_label["score"].mean()
121
+ else: # no source found in the input text
122
+ print("No source found in the input text")
123
+ text = " ".join(self.aligned_paragraphs_df["input"].tolist())
124
  # detect by baseline model
125
+ label, score = detect_text_by_ai_model(text)
126
+ self.text_prediction_label[0] = label
127
+ self.text_prediction_score[0] = score
128
+
129
  def find_text_source(self):
130
  """
131
  Determines the origin of the given text based on paraphrasing detection
 
147
  # Setup df for input_sentences
148
 
149
  for _ in range(len(input_sentences)):
150
+ self.aligned_paragraphs_df = pd.concat(
151
+ [self.aligned_paragraphs_df, pd.DataFrame([{
152
  "input": None,
153
  "source": None,
154
  "label": None,
 
161
  )
162
 
163
  # find a source for each paragraph
164
+ for index, _ in enumerate(input_sentences):
165
+ similarity = self.aligned_paragraphs_df.loc[index, "similarity"]
166
+ if similarity is not None:
167
+ if similarity > PARAPHRASE_THRESHOLD_MACHINE:
168
+ continue
169
 
170
+ print(f"\n-------index = {index}-------")
171
+ print(f"current_text = {input_sentences[index]}\n")
172
 
173
+ self.aligned_paragraphs_df, img_urls = find_paragraph_source(
174
  input_sentences,
175
  index,
176
+ self.aligned_paragraphs_df,
177
  )
178
 
179
  self.found_img_url.extend(img_urls)
180
 
181
  # determine if the whole source is from a news or not
182
 
183
+ def verify_text(self, url):
184
+ label = "UNKNOWN"
185
+ score = 0
186
  # calculate the average similarity when the similary score in each row of sentences_df is higher than 0.8
187
+ filtered_by_url = self.aligned_paragraphs_df[
188
+ self.aligned_paragraphs_df["url"] == url
189
  ]
190
+ filtered_by_similarity = filtered_by_url[
191
+ filtered_by_url["similarity"] > 0.8
192
+ ]
193
+ if len(filtered_by_similarity) / len(self.aligned_paragraphs_df) > 0.5:
194
+ # check if "MACHINE" is in self.aligned_sentences_df["label"]:
195
+ contains_machine = filtered_by_similarity["label"].str.contains(
196
+ "MACHINE", case=False, na=False
197
+ ).any()
198
+ if contains_machine:
199
+ label = "MACHINE"
200
+ machine_rows = filtered_by_similarity[
201
+ filtered_by_similarity["label"].str.contains(
202
+ "MACHINE",
203
+ case=False,
204
+ na=False)
205
+ ]
206
+ generated_model, _ = predict_generation_model(self.news_text)
207
+ label += f"<br>({generated_model})"
208
+ score = machine_rows["similarity"].mean()
209
+ else:
210
+ label = "HUMAN"
211
+ human_rows = filtered_by_similarity[
212
+ filtered_by_similarity["label"].str.contains(
213
+ "HUMAN",
214
+ case=False,
215
+ na=False)
216
+ ]
217
+ score = human_rows["similarity"].mean()
218
+
219
+ return label, score
220
+
221
 
222
  def determine_image_origin(self):
223
  print("CHECK IMAGE:")
 
227
  self.image_referent_url = None
228
  return
229
 
 
 
230
  matched_url, similarity = detect_image_from_news_image(
231
  self.news_image,
232
  self.found_img_url,
233
  )
234
  if matched_url is not None:
235
+ print(f"matched image: {matched_url}\nsimilarity: {similarity}\n")
236
  self.image_prediction_label = "HUMAN"
237
  self.image_prediction_score = similarity
238
  self.image_referent_url = matched_url
 
242
  self.news_image,
243
  )
244
  if matched_url is not None:
245
+ print(f"matched image: {matched_url}\tScore: {similarity}%\n")
246
  self.image_prediction_label = "HUMAN"
247
  self.image_prediction_score = similarity
248
  self.image_referent_url = matched_url
 
260
  self.image_prediction_score = 50
261
  self.image_referent_url = None
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  def generate_analysis_report(self):
264
+ if self.news_text != "":
265
+ self.determine_text_origin()
266
+ if self.news_image != "":
267
+ self.determine_image_origin()
268
 
269
  def analyze_details(self):
270
+ self.handle_entities()
271
+ ordinary_user_table = self.create_ordinary_user_table()
272
+ fact_checker_table = self.create_fact_checker_table()
273
+ governor_table = self.create_governor_table()
274
+
275
+ return ordinary_user_table, fact_checker_table, governor_table
276
+
277
+ def handle_entities(self):
278
  entities_with_colors = []
279
+ for index, row in self.grouped_url_df.iterrows():
280
  # Get entity-words (in pair) with colors
281
  entities_with_colors = highlight_entities(
282
+ row["input"],
283
+ row["source"],
284
  )
 
285
 
286
+ #self.grouped_url_df.at[index, "entities"] = entities_with_colors # must use at
287
+
288
+ for index, paragraph in self.aligned_paragraphs_df.iterrows():
289
+ if paragraph["url"] == row["url"]:
290
+ self.aligned_paragraphs_df.at[index, "entities"] = entities_with_colors # must use at
291
 
 
292
 
293
  def get_text_urls(self):
294
  return set(self.text_referent_url)
 
335
  max_length = 30 # TODO: put this in configuration
336
  rows.append(self.format_image_fact_checker_row(max_length))
337
 
338
+ for _, row in self.aligned_paragraphs_df.iterrows():
339
+ if row["input"] == None:
340
  continue
341
+
342
+ if row["source"] == None:
343
+ equal_idx_1 = equal_idx_2 = []
344
+
345
+ else: # Get index of equal phrases in input and source sentences
346
+ equal_idx_1, equal_idx_2 = extract_equal_text(
347
+ row["input"],
348
+ row["source"],
349
+ )
 
 
 
350
 
351
  self.fact_checker_table.append(
352
  [
353
+ row,
354
  equal_idx_1,
355
  equal_idx_2,
356
+ row["entities"],
357
+ row["url"]
358
  ],
359
  )
360
+
361
+ previous_url = None
362
+ span_row = 1
363
+ for index, row in enumerate(self.fact_checker_table):
364
+ current_url = row[4]
365
+ last_url_row = False
366
+
367
+ # First row or URL change
368
+ if index == 0 or current_url != previous_url:
369
+ first_url_row = True
370
+ previous_url = current_url
371
+ # Increase counter "span_row" when the next url is the same
372
+ while index + span_row < len(self.fact_checker_table) \
373
+ and self.fact_checker_table[index + span_row][4] == current_url:
374
+ span_row += 1
375
+
376
+ else:
377
+ first_url_row = False
378
+ span_row -= 1
379
+
380
+ if span_row == 1:
381
+ last_url_row = True
382
+
383
+ formatted_row = self.format_text_fact_checker_row(row, first_url_row, last_url_row, span_row, max_length)
384
  rows.append(formatted_row)
385
 
386
  table = "\n".join(rows)
 
394
  <thead>
395
  <tr>
396
  <th>Input news</th>
397
+ <th>Source (URL in Originality)</th>
398
  <th>Forensic</th>
399
  <th>Originality</th>
400
  </tr>
 
407
  <style>
408
  """
409
 
410
+ def format_text_fact_checker_row(
411
+ self,
412
+ row,
413
+ first_url_row=True,
414
+ last_url_row=True,
415
+ span_row=1,
416
+ max_length=30,
417
+ ):
418
  entity_count = 0
419
+ if row[0]["input"] is None:
420
  return ""
421
+ if row[0]["source"] is not None: # source is not empty
422
+ if row[3] is not None:
423
+ # highlight entities
424
+ input_sentence, highlight_idx_input = apply_highlight(
425
+ row[0]["input"],
426
+ row[3],
427
+ "input",
428
+ )
429
+ source_sentence, highlight_idx_source = apply_highlight(
430
+ row[0]["source"],
431
+ row[3],
432
+ "source",
433
+ )
434
+ else:
435
+ input_sentence = row[0]["input"]
436
+ source_sentence = row[0]["source"]
437
+ highlight_idx_input = []
438
+ highlight_idx_source = []
439
+
440
+ if row[3] is not None:
441
+ entity_count = len(row[3])
442
 
443
  # Color overlapping words
444
  input_sentence = self.color_text(
 
452
  highlight_idx_source,
453
  ) # text, index of highlight words
454
 
455
+ # Replace _ to get correct formatting
456
+ # Original one having _ for correct word counting
457
  input_sentence = input_sentence.replace(
458
  "span_style",
459
  "span style",
 
466
  input_sentence = row[0]["input"]
467
  source_sentence = row[0]["source"]
468
 
469
+ url = row[0]["url"]
470
+ # Displayed label and score by url
471
+ filterby_url = self.grouped_url_df[
472
+ self.grouped_url_df["url"] == url
473
+ ]
474
+ if len(filterby_url) > 0:
475
+ label = filterby_url["label"].values[0]
476
+ score = filterby_url["score"].values[0]
477
+ else:
478
+ label = self.text_prediction_label[0]
479
+ score = self.text_prediction_score[0]
480
+
481
+ # Format displayed url
482
+
483
  short_url = self.shorten_url(url, max_length)
484
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
485
 
486
+ # Format displayed entity count
487
  entity_count_text = self.get_entity_count_text(entity_count)
488
+
489
+ border_top = "border-top: 1px solid transparent;"
490
+ border_bottom = "border-bottom: 1px solid transparent;"
491
+ if first_url_row is True:
492
+ # First & Last the group: no transparent
493
+ if last_url_row is True:
494
+ return f"""
495
+ <tr>
496
+ <td>{input_sentence}</td>
497
+ <td>{source_sentence}</td>
498
+ <td rowspan="{span_row}">{label}<br>
499
+ ({score * 100:.2f}%)<br><br>
500
+ {entity_count_text}</td>
501
+ <td rowspan="{span_row}">{source_text_url}</td>
502
+ </tr>
503
+ """
504
+ # First row of the group: transparent bottom border
505
+ return f"""
506
+ <tr>
507
+ <td style="{border_bottom}";>{input_sentence}</td>
508
+ <td style="{border_bottom}";>{source_sentence}</td>
509
+ <td rowspan="{span_row}">{label}<br>
510
+ ({score * 100:.2f}%)<br><br>
511
+ {entity_count_text}</td>
512
+ <td rowspan="{span_row}">{source_text_url}</td>
513
+ </tr>
514
+ """
515
+ else:
516
+ if last_url_row is True:
517
+ # NOT First row, Last row: transparent top border
518
+ return f"""
519
+ <tr>
520
+ <td style="{border_top}";>{input_sentence}</td>
521
+ <td style="{border_top}";>{source_sentence}</td>
522
+ </tr>
523
+ """
524
+ else:
525
+ # NOT First & NOT Last row: transparent top & bottom borders
526
+ return f"""
527
+ <tr>
528
+ <td style="{border_top} {border_bottom}";>{input_sentence}</td>
529
+ <td style="{border_top} {border_bottom}";>{source_sentence}</td>
530
+ </tr>
531
+ """
532
 
533
  def format_image_fact_checker_row(self, max_length=30):
534
 
 
536
  self.image_referent_url is not None
537
  or self.image_referent_url != ""
538
  ):
539
+ source_image = f"""<img src="{self.image_referent_url}" width="100" height="150">""" # noqa: E501
540
  short_url = self.shorten_url(self.image_referent_url, max_length)
541
  source_image_url = (
542
  f"""<a href="{self.image_referent_url}">{short_url}</a>"""
 
558
  <h5>Comparison between input news and source news:</h5>
559
  <table border="1" style="width:100%; text-align:left;">
560
  <col style="width: 170px;">
 
561
  <col style="width: 30px;">
562
  <col style="width: 75px;">
563
  <thead>
 
578
  def format_text_ordinary_user_row(self, max_length=30):
579
  input_sentences = ""
580
  source_text_urls = ""
581
+ urls = []
582
+ for _, row in self.aligned_paragraphs_df.iterrows():
583
+ if row["input"] == None:
 
 
584
  continue
585
  input_sentences += row["input"] + "<br><br>"
586
+ url = row["url"]
587
+ if url not in urls:
588
+ urls.append(url)
589
+ short_url = self.shorten_url(url, max_length)
590
+ source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
 
 
 
591
 
592
  return f"""
593
  <tr>
594
  <td>{input_sentences}</td>
595
+ <td>{self.text_prediction_label[0]}<br>
596
+ ({self.text_prediction_score[0] * 100:.2f}%)</td>
597
  <td>{source_text_urls}</td>
598
  </tr>
599
  """
 
619
  max_length = 30 # TODO: put this in configuration
620
  rows.append(self.format_image_governor_row(max_length))
621
 
622
+ for _, row in self.aligned_paragraphs_df.iterrows():
623
+ if row["input"] == None:
624
  continue
625
+
626
+ if row["source"] == None:
627
+ equal_idx_1 = equal_idx_2 = []
628
+
629
+ else:
630
+ # Get index of equal phrases in input and source sentences
631
+ equal_idx_1, equal_idx_2 = extract_equal_text(
632
+ row["input"],
633
+ row["source"],
634
+ )
 
 
635
 
636
  self.governor_table.append(
637
  [
638
+ row,
639
  equal_idx_1,
640
  equal_idx_2,
641
+ row["entities"],
642
  ],
643
  )
644
 
 
656
  <thead>
657
  <tr>
658
  <th>Input news</th>
659
+ <th>Source (URL in Originality)</th>
660
  <th>Forensic</th>
661
  <th>Originality</th>
662
  </tr>
 
673
  input_sentences = ""
674
  source_sentences = ""
675
  source_text_urls = ""
676
+ urls = []
677
  sentence_count = 0
678
+ entity_count = [0, 0] # to get index of [-2]
679
  for row in self.governor_table:
680
+ if row[0]["input"] is None:
 
681
  continue
682
 
683
+ if row[0]["source"] is not None and row[3] is not None: # source is not empty
684
  # highlight entities
685
  input_sentence, highlight_idx_input = apply_highlight(
686
  row[0]["input"],
687
+ row[3], # entities_with_colors
688
+ "input", # key
689
+ entity_count[-2], # since the last one is for current counting
690
  )
691
  source_sentence, highlight_idx_source = apply_highlight(
692
  row[0]["source"],
693
+ row[3], # entities_with_colors
694
+ "source", # key
695
+ entity_count[-2], # since the last one is for current counting
696
  )
 
697
 
698
  # Color overlapping words
699
  input_sentence = self.color_text(
 
717
  ).replace("1px_4px", "1px 4px")
718
 
719
  else:
720
+ if row[0]["source"] is None:
721
+ source_sentence = ""
722
+ else:
723
+ source_sentence = row[0]["source"]
724
  input_sentence = row[0]["input"]
725
+
726
 
727
  # convert score to HUMAN-based score:
728
  input_sentences += input_sentence + "<br><br>"
729
  source_sentences += source_sentence + "<br><br>"
730
+
731
  url = row[0]["url"]
732
+ if url not in urls:
733
+ urls.append(url)
734
+ short_url = self.shorten_url(url, max_length)
735
+ source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
736
+ sentence_count += 1
737
+ if row[3] is not None:
738
+ entity_count.append(len(row[3]))
739
+
740
+ entity_count_text = self.get_entity_count_text(sum(entity_count))
741
 
742
  return f"""
743
  <tr>
744
  <td>{input_sentences}</td>
745
  <td>{source_sentences}</td>
746
+ <td>{self.text_prediction_label[0]}<br>
747
+ ({self.text_prediction_score[0] * 100:.2f}%)<br><br>
748
+ {entity_count_text}</td>
749
  <td>{source_text_urls}</td>
750
  </tr>
751
  """
 
755
  self.image_referent_url is not None
756
  or self.image_referent_url != ""
757
  ):
758
+ source_image = f"""<img src="{self.image_referent_url}" width="100" height="150">""" # noqa: E501
759
  short_url = self.shorten_url(self.image_referent_url, max_length)
760
  source_image_url = (
761
  f"""<a href="{self.image_referent_url}">{short_url}</a>"""
 
770
  if entity_count <= 0:
771
  entity_count_text = ""
772
  elif entity_count == 1:
773
+ entity_count_text = "with 1 altered entity"
774
  else:
775
  entity_count_text = "with altered entities"
776
  return entity_count_text
 
791
 
792
  starts, ends = self.extract_starts_ends(colored_idx)
793
  starts, ends = self.filter_indices(starts, ends, highlighted_idx)
794
+
795
  previous_end = 0
796
  for start, end in zip(starts, ends):
797
  paragraph += " ".join(words[previous_end:start])
 
801
 
802
  previous_end = end
803
 
804
+ paragraph += " ".join(words[previous_end:])
 
 
 
 
 
 
805
 
806
  return paragraph
807
 
 
884
  end = number
885
  else:
886
  starts.append(start)
887
+ ends.append(end)
888
  start = number
889
  end = number
890
 
891
  if i == len(numbers) - 1:
892
  starts.append(start)
893
+ ends.append(end)
 
 
894
 
895
+ return starts, ends
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/application/text/entity.py CHANGED
@@ -161,16 +161,17 @@ def assign_colors_to_entities(entities):
161
 
162
 
163
  def highlight_entities(text1, text2):
164
- if text1 == "" or text2 == "":
165
- return []
166
 
167
  entities_text = extract_entities_gpt(text1, text2)
168
- print(f"entities_text: {entities_text}")
169
 
170
  # Clean up entities: remove wrapping characters
171
  entities_text = entities_text.replace("```json", "").replace("```", "")
172
 
173
  entities = read_json(entities_text)
 
 
174
 
175
  # Assign colors to entities
176
  entities_with_colors = assign_colors_to_entities(entities)
@@ -179,7 +180,7 @@ def highlight_entities(text1, text2):
179
 
180
 
181
  def apply_highlight(text, entities_with_colors, key="input", count=0):
182
- if entities_with_colors == []:
183
  return text, []
184
 
185
  all_starts = []
 
161
 
162
 
163
  def highlight_entities(text1, text2):
164
+ if text1 == None or text2 == None:
165
+ return None
166
 
167
  entities_text = extract_entities_gpt(text1, text2)
 
168
 
169
  # Clean up entities: remove wrapping characters
170
  entities_text = entities_text.replace("```json", "").replace("```", "")
171
 
172
  entities = read_json(entities_text)
173
+ if len(entities) == 0:
174
+ return None
175
 
176
  # Assign colors to entities
177
  entities_with_colors = assign_colors_to_entities(entities)
 
180
 
181
 
182
  def apply_highlight(text, entities_with_colors, key="input", count=0):
183
+ if entities_with_colors is None:
184
  return text, []
185
 
186
  all_starts = []
src/application/text/helper.py CHANGED
@@ -147,7 +147,7 @@ def extract_equal_text(text1, text2):
147
  text = text.lower()
148
  text = text.translate(str.maketrans("", "", string.punctuation))
149
  return text
150
-
151
  splited_text1 = cleanup(text1).split()
152
  splited_text2 = cleanup(text2).split()
153
 
@@ -163,8 +163,7 @@ def extract_equal_text(text1, text2):
163
  equal_idx_2.append({"start": j1, "end": j2})
164
  # subtext_1 = " ".join(text1[i1:i2])
165
  # subtext_2 = " ".join(text2[j1:j2])
166
- # print(f'{tag:7} a[{i1:2}:{i2:2}]
167
- # --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
168
  return equal_idx_1, equal_idx_2
169
 
170
 
 
147
  text = text.lower()
148
  text = text.translate(str.maketrans("", "", string.punctuation))
149
  return text
150
+
151
  splited_text1 = cleanup(text1).split()
152
  splited_text2 = cleanup(text2).split()
153
 
 
163
  equal_idx_2.append({"start": j1, "end": j2})
164
  # subtext_1 = " ".join(text1[i1:i2])
165
  # subtext_2 = " ".join(text2[j1:j2])
166
+ # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
 
167
  return equal_idx_1, equal_idx_2
168
 
169
 
src/application/text/model_detection.py CHANGED
@@ -1,24 +1,48 @@
1
  from transformers import pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  # TODO: move to a config file
4
- DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
 
5
 
6
- MODEL_HUMAN_LABEL = {DEFAULT_MODEL: "Human"}
7
  HUMAN = "HUMAN"
8
  MACHINE = "MACHINE"
9
  UNKNOWN = "UNKNOWN"
10
  PARAPHRASE = "PARAPHRASE"
11
  NON_PARAPHRASE = "NON_PARAPHRASE"
12
 
 
 
 
 
 
13
 
14
  def detect_text_by_ai_model(
15
  input_text: str,
16
- model: str = DEFAULT_MODEL,
17
  max_length: int = 512,
18
  ) -> tuple:
19
  """
20
- Model: chatgpt_detector_roberta
21
- Ref: https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta
22
 
23
  Detects if text is human or machine generated.
24
 
@@ -42,7 +66,89 @@ def detect_text_by_ai_model(
42
  label = HUMAN
43
  else:
44
  label = MACHINE
 
 
45
  return label, confidence_score
46
  except Exception as e: # Add exception handling
47
  print(f"Error in Roberta model inference: {e}")
48
  return UNKNOWN, 0.5 # Return UNKNOWN and 0.0 confidence if error
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from transformers import pipeline
2
+ import os
3
+
4
+ from dotenv import load_dotenv
5
+ from openai import AzureOpenAI, OpenAIError
6
+ from sentence_transformers import SentenceTransformer, util
7
+ import torch
8
+
9
+
10
+ load_dotenv()
11
+ AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
12
+ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
13
+ AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
14
+
15
+ azure_client = AzureOpenAI(
16
+ azure_endpoint="https://quoc-nguyen.openai.azure.com/",
17
+ api_key=AZURE_OPENAI_API_KEY,
18
+ api_version="2024-05-01-preview",
19
+ )
20
 
21
  # TODO: move to a config file
22
+ # AI_TEXT_DECTECTION_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
23
+ AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
24
 
25
+ MODEL_HUMAN_LABEL = {AI_TEXT_DECTECTION_MODEL: "Human"}
26
  HUMAN = "HUMAN"
27
  MACHINE = "MACHINE"
28
  UNKNOWN = "UNKNOWN"
29
  PARAPHRASE = "PARAPHRASE"
30
  NON_PARAPHRASE = "NON_PARAPHRASE"
31
 
32
+ # load the embedding model
33
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34
+ PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
35
+ PARAPHASE_MODEL.to(DEVICE)
36
+
37
 
38
  def detect_text_by_ai_model(
39
  input_text: str,
40
+ model: str = AI_TEXT_DECTECTION_MODEL,
41
  max_length: int = 512,
42
  ) -> tuple:
43
  """
44
+ Model: RADAR-Vicuna-7B
45
+ Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B
46
 
47
  Detects if text is human or machine generated.
48
 
 
66
  label = HUMAN
67
  else:
68
  label = MACHINE
69
+ generated_model, _ = predict_generation_model(input_text)
70
+ label += f"<br>({generated_model})"
71
  return label, confidence_score
72
  except Exception as e: # Add exception handling
73
  print(f"Error in Roberta model inference: {e}")
74
  return UNKNOWN, 0.5 # Return UNKNOWN and 0.0 confidence if error
75
+
76
+
77
+ def predict_generation_model(text:str) -> tuple[str, float]:
78
+ """
79
+ Predicts if text is generated by gpt-4o or gpt-4o-mini models.
80
+ Compare the input text against the paraphrased text by the models.
81
+
82
+ Returns:
83
+ tuple: (label, confidence_score)
84
+ where label is gpt-4o or gpt-4o-mini.
85
+ """
86
+ best_similarity = 0
87
+ best_model = "gpt-4o"
88
+ models = ["gpt-4o", "gpt-4o-mini"]
89
+ for model in models:
90
+ paraphrased_text = paraphrase_by_AI(text, model)
91
+ if paraphrased_text is None:
92
+ continue
93
+ similarity = measure_text_similarity(text, paraphrased_text)
94
+ if similarity > best_similarity:
95
+ best_similarity = similarity
96
+ best_model = model
97
+
98
+ return best_model, best_similarity
99
+
100
+
101
+ def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str:
102
+ """
103
+ Paraphrase text using a given model.
104
+
105
+ Returns:
106
+ str: Paraphrased text.
107
+ """
108
+
109
+ prompt = f"""
110
+ Paraphrase the following news, only output the paraphrased text:
111
+ {input_text}
112
+ """
113
+ try:
114
+ response = azure_client.chat.completions.create(
115
+ model=model,
116
+ messages=[
117
+ {"role": "user", "content": prompt},
118
+ ],
119
+ # max_tokens=100,
120
+ # temperature=0.7,
121
+ # top_p=0.9,
122
+ # n=1,
123
+ )
124
+ paraphrased_text = response.choices[0].message.content
125
+ return paraphrased_text
126
+ except OpenAIError as e: # Add exception handling
127
+ print(f"Error in AI model inference: {e}")
128
+ return None
129
+
130
+ def measure_text_similarity(text1: str, text2: str) -> float:
131
+ """
132
+ Measure the similarity between two texts.
133
+
134
+ Returns:
135
+ float: Similarity score.
136
+ """
137
+ embeddings1 = PARAPHASE_MODEL.encode(
138
+ text1,
139
+ convert_to_tensor=True,
140
+ device=DEVICE,
141
+ show_progress_bar=False,
142
+ )
143
+ embeddings2 = PARAPHASE_MODEL.encode(
144
+ text2,
145
+ convert_to_tensor=True,
146
+ device=DEVICE,
147
+ show_progress_bar=False,
148
+ )
149
+
150
+ # Compute cosine similarity matrix
151
+ similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
152
+ print(similarity[0][0])
153
+ return similarity[0][0]
154
+
src/application/text/search.py CHANGED
@@ -174,7 +174,6 @@ def generate_search_phrases(input_text):
174
  # Method 4: Get most identities and key words
175
  entities = extract_entities(input_text)
176
  text_without_entities = remove_identities_from_text(input_text, entities)
177
- print(f"text_without_entities: {text_without_entities}")
178
  search_phrases.append(text_without_entities)
179
  # keywords = get_keywords(input_text, 16)
180
  # search_phrase = " ".join(entities) + " " + " ".join(keywords)
 
174
  # Method 4: Get most identities and key words
175
  entities = extract_entities(input_text)
176
  text_without_entities = remove_identities_from_text(input_text, entities)
 
177
  search_phrases.append(text_without_entities)
178
  # keywords = get_keywords(input_text, 16)
179
  # search_phrase = " ".join(entities) + " " + " ".join(keywords)
src/application/text/search_detection.py CHANGED
@@ -1,17 +1,14 @@
1
- import string
2
  import warnings
3
  from difflib import SequenceMatcher
4
 
5
  import nltk
6
  import numpy as np
7
- import pandas as pd
8
  import torch
9
  from sentence_transformers import (
10
  SentenceTransformer,
11
  util,
12
  )
13
 
14
- from src.application.text.helper import extract_equal_text
15
  from src.application.text.preprocessing import split_into_paragraphs
16
  from src.application.text.search import (
17
  generate_search_phrases,
@@ -41,102 +38,11 @@ MIN_RATIO_PARAPHRASE_NUM = 0.5
41
  MAX_CHAR_SIZE = 30000
42
 
43
 
44
- def detect_text_by_relative_search(
45
- input_text,
46
- index,
47
- is_support_opposite=False,
48
- ):
49
- checked_urls = set()
50
- searched_phrases = generate_search_phrases(input_text[index])
51
-
52
- for candidate in searched_phrases:
53
- search_results = search_by_google(candidate)
54
- urls = [item["link"] for item in search_results.get("items", [])]
55
-
56
- for url in urls[:3]:
57
- if url in checked_urls: # visited url
58
- continue
59
- if "bbc.com" not in url:
60
- continue
61
-
62
- checked_urls.add(url)
63
- print(f"\t\tChecking URL: {url}")
64
-
65
- content = URLReader(url)
66
-
67
- if content.is_extracted is True:
68
- if content.title is None or content.text is None:
69
- print("\t\t\t↑↑↑ Title or text not found")
70
- continue
71
-
72
- page_text = content.title + "\n" + content.text
73
- if len(page_text) > MAX_CHAR_SIZE:
74
- print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
75
- continue
76
- print(f"\t\t\t↑↑↑ Title: {content.title}")
77
- aligned_first_sentences = check_paraphrase(
78
- input_text[index],
79
- page_text,
80
- url,
81
- )
82
- is_paraphrased = aligned_first_sentences["is_paraphrased"]
83
-
84
- if is_paraphrased is False:
85
- return (
86
- is_paraphrased,
87
- url,
88
- aligned_first_sentences,
89
- content.images,
90
- index,
91
- )
92
-
93
- sub_paraphrase = True
94
- while sub_paraphrase is True:
95
- index += 1
96
- print(f"----search {index} < {len(input_text)}----")
97
- if index >= len(input_text):
98
- print(f"input_text_last: {input_text[-1]}")
99
- break
100
- print(f"input_text: {input_text[index]}")
101
- sub_sentences = check_paraphrase(
102
- input_text[index],
103
- page_text,
104
- url,
105
- )
106
- sub_paraphrase = sub_sentences["is_paraphrased"]
107
- print(f"sub_paraphrase: {sub_paraphrase}")
108
- print(f"sub_sentences: {sub_sentences}")
109
- if sub_paraphrase is True:
110
- aligned_first_sentences["input"] += (
111
- "<br>" + sub_sentences["input"]
112
- )
113
- aligned_first_sentences["source"] += (
114
- "<br>" + sub_sentences["source"]
115
- )
116
- aligned_first_sentences["similarity"] += sub_sentences[
117
- "similarity"
118
- ]
119
- aligned_first_sentences["similarity"] /= 2
120
-
121
- print(f"paraphrase: {is_paraphrased}")
122
- print(f"aligned_first_sentences: {aligned_first_sentences}")
123
- return (
124
- is_paraphrased,
125
- url,
126
- aligned_first_sentences,
127
- content.images,
128
- index,
129
- )
130
-
131
- return False, None, [], [], index
132
-
133
-
134
  def find_paragraph_source(text, text_index, sentences_df):
135
 
136
  checked_urls = set()
137
  searched_phrases = generate_search_phrases(text[text_index])
138
- print(f"text[text_index]: {text[text_index]}")
139
- print(f"searched_phrases: {searched_phrases}")
140
  for candidate in searched_phrases:
141
  search_results = search_by_google(candidate)
142
  urls = [item["link"] for item in search_results.get("items", [])]
@@ -169,11 +75,10 @@ def find_paragraph_source(text, text_index, sentences_df):
169
  )
170
 
171
  if aligned_sentence["paraphrase"] is False:
172
- print(f'sentence_1: {sentences_df.loc[text_index, "input"]}')
173
- print(f'sentence_2: {aligned_sentence["input"]}')
174
  sentences_df.loc[text_index, "input"] = aligned_sentence["input"]
175
  sentences_df.loc[text_index, "paraphrase"] = aligned_sentence["paraphrase"]
176
  return sentences_df, []
 
177
  # assign values
178
  columns = [
179
  "input",
@@ -187,32 +92,29 @@ def find_paragraph_source(text, text_index, sentences_df):
187
  if c in sentences_df.columns:
188
  sentences_df.loc[text_index, c] = aligned_sentence[c]
189
 
190
-
191
- print(f"sen: {sentences_df}")
192
- for idx, _ in enumerate(sentences_df):
193
- print(f"{idx}")
194
- if idx > len(sentences_df):
195
- break
196
- if sentences_df.loc[idx, "url"] is not None:
197
- continue
198
-
199
- # find content in new url
200
  aligned_sentence = check_paraphrase(
201
  text[idx],
202
  page_text,
203
  url,
204
  )
205
 
206
- if aligned_sentence["url"] is None:
207
- continue
208
-
209
- columns = ["input", "source", "label", "similarity", "url"]
210
- for c in columns:
211
- if c in sentences_df.columns:
212
- sentences_df.loc[text_index, c] = aligned_sentence[c]
213
-
214
  return sentences_df, content.images
215
 
 
216
  return sentences_df, []
217
 
218
 
@@ -344,11 +246,13 @@ def check_paraphrase(input_text, page_text, url):
344
  input_paragraphs,
345
  convert_to_tensor=True,
346
  device=DEVICE,
 
347
  )
348
  embeddings2 = PARAPHASE_MODEL.encode(
349
  page_paragraphs,
350
  convert_to_tensor=True,
351
  device=DEVICE,
 
352
  )
353
 
354
  # Compute cosine similarity matrix
@@ -361,12 +265,7 @@ def check_paraphrase(input_text, page_text, url):
361
  max_similarity = similarity_matrix[i][max_sim_index]
362
 
363
  label, is_paraphrased = determine_label(max_similarity)
364
- print(f"is_paraphrased: {is_paraphrased}")
365
- if is_paraphrased is False:
366
- url = None
367
- best_matched_paragraph = None
368
- else:
369
- best_matched_paragraph = page_paragraphs[max_sim_index]
370
 
371
  alignment = {
372
  "input": paragraph,
@@ -376,6 +275,7 @@ def check_paraphrase(input_text, page_text, url):
376
  "paraphrase": is_paraphrased,
377
  "url": url,
378
  }
 
379
 
380
  return alignment
381
 
@@ -423,7 +323,7 @@ def determine_label(similarity):
423
  elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
424
  return "MACHINE", True
425
  else:
426
- return "", False
427
 
428
 
429
  if __name__ == "__main__":
 
 
1
  import warnings
2
  from difflib import SequenceMatcher
3
 
4
  import nltk
5
  import numpy as np
 
6
  import torch
7
  from sentence_transformers import (
8
  SentenceTransformer,
9
  util,
10
  )
11
 
 
12
  from src.application.text.preprocessing import split_into_paragraphs
13
  from src.application.text.search import (
14
  generate_search_phrases,
 
38
  MAX_CHAR_SIZE = 30000
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def find_paragraph_source(text, text_index, sentences_df):
42
 
43
  checked_urls = set()
44
  searched_phrases = generate_search_phrases(text[text_index])
45
+
 
46
  for candidate in searched_phrases:
47
  search_results = search_by_google(candidate)
48
  urls = [item["link"] for item in search_results.get("items", [])]
 
75
  )
76
 
77
  if aligned_sentence["paraphrase"] is False:
 
 
78
  sentences_df.loc[text_index, "input"] = aligned_sentence["input"]
79
  sentences_df.loc[text_index, "paraphrase"] = aligned_sentence["paraphrase"]
80
  return sentences_df, []
81
+
82
  # assign values
83
  columns = [
84
  "input",
 
92
  if c in sentences_df.columns:
93
  sentences_df.loc[text_index, c] = aligned_sentence[c]
94
 
95
+
96
+ for idx, _ in sentences_df.iterrows():
97
+ similarity = sentences_df.loc[idx, "similarity"]
98
+ if similarity is not None:
99
+ if similarity > PARAPHRASE_THRESHOLD_MACHINE:
100
+ continue
101
+
102
+ # find matched content in new url
 
 
103
  aligned_sentence = check_paraphrase(
104
  text[idx],
105
  page_text,
106
  url,
107
  )
108
 
109
+ if similarity is None or \
110
+ aligned_sentence["similarity"] > similarity:
111
+ columns = ["input", "source", "label", "similarity", "url"]
112
+ for c in columns:
113
+ if c in sentences_df.columns:
114
+ sentences_df.loc[idx, c] = aligned_sentence[c]
 
 
115
  return sentences_df, content.images
116
 
117
+ sentences_df.loc[text_index, "input"] = text[text_index]
118
  return sentences_df, []
119
 
120
 
 
246
  input_paragraphs,
247
  convert_to_tensor=True,
248
  device=DEVICE,
249
+ show_progress_bar=False,
250
  )
251
  embeddings2 = PARAPHASE_MODEL.encode(
252
  page_paragraphs,
253
  convert_to_tensor=True,
254
  device=DEVICE,
255
+ show_progress_bar=False,
256
  )
257
 
258
  # Compute cosine similarity matrix
 
265
  max_similarity = similarity_matrix[i][max_sim_index]
266
 
267
  label, is_paraphrased = determine_label(max_similarity)
268
+ best_matched_paragraph = page_paragraphs[max_sim_index]
 
 
 
 
 
269
 
270
  alignment = {
271
  "input": paragraph,
 
275
  "paraphrase": is_paraphrased,
276
  "url": url,
277
  }
278
+ print(f"Result: [{alignment['similarity']}] {alignment['source']}")
279
 
280
  return alignment
281
 
 
323
  elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
324
  return "MACHINE", True
325
  else:
326
+ return None, False
327
 
328
 
329
  if __name__ == "__main__":
test.py CHANGED
@@ -1,13 +1,2 @@
1
- import numpy as np
2
- import pandas as pd
3
-
4
- # Create an empty DataFrame with 5 columns
5
- df = pd.DataFrame(columns=['col1', 'col2', 'col3', 'col4', 'col5']) # Or any column names you want
6
-
7
- # Method 1: Using a dictionary and append (less efficient for large DataFrames)
8
- for _ in range(5): # Add 5 rows
9
- df = pd.concat([df, pd.DataFrame([{'col1': np.nan, 'col2': np.nan, 'col3': np.nan, 'col4': np.nan, 'col5': np.nan}])], ignore_index=True)
10
- d = {"col1": "ta", "col2": "gs"}
11
- df.loc[1, "col1"] = d["col1"]
12
- for index, row in enumerate(df):
13
- print(index)
 
1
+ my_list = [0, 0]
2
+ print(my_list[-2])