pmkhanh7890 committed
Commit 00b1038 · Parent: e58707f

refactor code + fix bug of label after grouping url

application_2.py DELETED
File without changes
gpt_test.py CHANGED
@@ -91,9 +91,10 @@ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
 AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
 
 azure_client = AzureOpenAI(
-    azure_endpoint="https://quoc-nguyen.openai.azure.com/",
+    azure_endpoint=AZURE_OPENAI_ENDPOINT,
     api_key=AZURE_OPENAI_API_KEY,
-    api_version="2024-05-01-preview",
+    api_version=AZURE_OPENAI_API_VERSION,
+    # API VERSION=[ 2024-12-01-preview, 2024-05-01-preview]
 )
 
 deplopment_name = "gpt-4o"  # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
@@ -127,4 +128,4 @@ for index, news in enumerate(text):
     count += 1
     paraphrased_news = response.choices[0].message.content
 
-    add_text_to_csv("data/MAGE_2_4o.csv", paraphrased_news, count)
+    add_text_to_csv("data/test.csv", paraphrased_news, count)
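
Note: this refactor moves the Azure endpoint and API version out of source and into environment variables. A minimal sketch of the resulting setup, assuming a local .env file loaded with python-dotenv (the loader is an assumption, not part of this commit):

import os

from dotenv import load_dotenv  # assumed helper for local development
from openai import AzureOpenAI

load_dotenv()  # populates AZURE_OPENAI_* from a .env file kept out of git

azure_client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

Keeping the endpoint and key in the environment avoids committing account-specific secrets, which is presumably why the hard-coded values were removed.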
requirements.txt CHANGED
@@ -17,8 +17,10 @@ scikit-learn
 nltk
 numpy
 torch
+tokenizers
 sentence-transformers
 accelerate
+sentencepiece
 
 # Images
 pillow==10.1.0
src/application/config.py CHANGED
@@ -43,6 +43,9 @@ PARAPHRASE_MODEL.to(DEVICE)
 # Model to detect AI-generated text
 AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
 
+# Model to classify AI-generated text
+AI_TEXT_CLASSIFICATION_MODEL = "ductuan024/gpts-detector"
+
 # Thresholds
 PARAPHRASE_THRESHOLD_HUMAN = 0.963
 PARAPHRASE_THRESHOLD_MACHINE = 0.8
@@ -89,3 +92,6 @@ ENTITY_BRIGHTNESS = 0.75  # color's brightness.
 
 # HTML formatting
 WORD_BREAK = "word-break: break-all;"
+
+# Prefix for output MACHINE label of text
+PREFIX = "Partially generated by "
src/application/content_detection.py CHANGED
@@ -74,8 +74,7 @@ class NewsVerification:
             news_image (str): The url of image in news article.
         """
         # Combine title and content for a full text representation.
-        # .strip() removes leading/trailing whitespace for cleaner text.
-        self.news_text = (news_title + "\n\n" + news_content).strip()
+        self.news_text = news_title + "\n\n" + news_content
 
         # if not isinstance(news_title, str) or not isinstance(
         #     news_content,
@@ -90,6 +89,9 @@ class NewsVerification:
         self.news_content = news_content
         self.news_image = news_image
 
+        self.text.input = self.news_text
+        self.image.input = news_image
+
     def group_by_url(self):
         """
        Groups aligned sentences by URL
@@ -138,6 +140,7 @@ class NewsVerification:
             # Detect text origin using an AI model.
             label, score = detect_text_by_ai_model(text)
 
+            print(f"labels = {label}")
             self.text.grouped_url_df.at[index, "label"] = label
             self.text.grouped_url_df.at[index, "score"] = score
 
@@ -169,6 +172,7 @@ class NewsVerification:
                     na=False,
                 )
             ]
+            print(f" machine_label = {machine_label}")
 
             if not machine_label.empty:
                 # If 'gpt-4o' labels are found, post-process and assign.
@@ -185,7 +189,9 @@ class NewsVerification:
                     self.aligned_sentences_df["label"] == "HUMAN"
                 ]
                 self.text.prediction_label[0] = "HUMAN"
-                self.text.prediction_score[0] = machine_label["score"].mean()
+                self.text.prediction_score[0] = self.text.grouped_url_df[
+                    "score"
+                ].mean()
             else:
                 # If no found URLs, use AI detection on the entire input text.
                 print("No source found in the input text")
@@ -306,7 +312,7 @@ class NewsVerification:
 
         # Check if a ratio of remaining filtering-sentences is more than 50%.
         if (
-            len(filtered_by_similarity) / len(self.aligned_sentences_df)
+            len(filtered_by_similarity) / len(filtered_by_url)
            > MIN_RATIO_PARAPHRASE_NUM
         ):
             # check if "MACHINE" is in self.aligned_sentences_df["label"]:
@@ -319,6 +325,7 @@ class NewsVerification:
                 )
                 .any()
             )
+            print(f"contain_machine = \n{contains_machine}")
 
             # TODO: integrate with determine_text_origin
             if contains_machine:
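
Note: the commit message's "fix bug of label after grouping url" is the score change above. The old line averaged only the 'gpt-4o' rows even when the final label was HUMAN; the new one averages the scores of all URL groups. A toy illustration (not project code):

import pandas as pd

grouped_url_df = pd.DataFrame(
    {
        "url": ["a.com", "b.com", "c.com"],
        "label": ["HUMAN", "Partially generated by gpt-4o", "HUMAN"],
        "score": [0.9, 0.7, 0.8],
    }
)
machine_label = grouped_url_df[
    grouped_url_df["label"].str.contains("gpt-4o", na=False)
]
print(machine_label["score"].mean())   # old behaviour: MACHINE rows only -> 0.7
print(grouped_url_df["score"].mean())  # new behaviour: every URL group -> 0.8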
src/application/content_generation.py CHANGED
@@ -86,7 +86,7 @@ def extract_title_content(fake_news: str) -> tuple[str, str]:
     title_start = fake_news.find("# Title: ") + len("# Title: ")
     title_end = fake_news.find("\n", title_start)
     if title_start != -1 and title_end != -1:
-        title = fake_news[title_start:title_end].strip()
+        title = fake_news[title_start:title_end]  # .strip()
 
     title_start = fake_news.find("\n# Content: ") + len(
         "\n# Content: ",
src/application/formatting_fact_checker.py CHANGED
@@ -7,7 +7,10 @@ from src.application.formatting import (
 )
 from src.application.image.image import ImageDetector
 from src.application.text.entity import apply_highlight
-from src.application.text.helper import extract_equal_text
+from src.application.text.helper import (
+    extract_equal_text,
+    replace_leading_spaces,
+)
 from src.application.text.text import TextDetector
 
 
@@ -17,63 +20,66 @@ def create_fact_checker_table(
     image: ImageDetector,
 ):
     rows = []
-    rows.append(format_image_fact_checker_row(image))
-
-    for _, row in aligned_sentences_df.iterrows():
-        if row["input"] is None:
-            continue
-
-        if row["source"] is None:
-            equal_idx_1 = equal_idx_2 = []
-
-        else:  # Get index of equal phrases in input and source sentences
-            equal_idx_1, equal_idx_2 = extract_equal_text(
-                row["input"],
-                row["source"],
+    if image.input is not None:
+        rows.append(format_image_fact_checker_row(image))
+
+    if text.input is not None:
+        for _, row in aligned_sentences_df.iterrows():
+            if row["input"] is None:
+                continue
+
+            if row["source"] is None:
+                equal_idx_1 = equal_idx_2 = []
+
+            else:  # Get index of equal phrases in input and source sentences
+                equal_idx_1, equal_idx_2 = extract_equal_text(
+                    row["input"],
+                    row["source"],
+                )
+
+            text.fact_checker_table.append(
+                [
+                    row,  # aligned_sentences_df
+                    equal_idx_1,  # index of equal text in input
+                    equal_idx_2,  # index of equal text in source
+                    row["entities"],
+                    row["url"],
+                ],
             )
 
-    text.fact_checker_table.append(
-        [
-            row,  # aligned_sentences_df
-            equal_idx_1,  # index of equal text in input
-            equal_idx_2,  # index of equal text in source
-            row["entities"],
-            row["url"],
-        ],
-    )
-
-    previous_url = None
-    span_row = 1
-    for index, row in enumerate(text.fact_checker_table):
-        current_url = row[4]
-        last_url_row = False
-
-        # First row or URL change
-        if index == 0 or current_url != previous_url:
-            first_url_row = True
-            previous_url = current_url
-            # Increase counter "span_row" when the next url is the same
-            while (
-                index + span_row < len(text.fact_checker_table)
-                and text.fact_checker_table[index + span_row][4] == current_url
-            ):
-                span_row += 1
-
-        else:
-            first_url_row = False
-            span_row -= 1
-
-        if span_row == 1:
-            last_url_row = True
-
-        formatted_row = format_text_fact_checker_row(
-            text,
-            row,
-            first_url_row,
-            last_url_row,
-            span_row,
-        )
-        rows.append(formatted_row)
+        previous_url = None
+        span_row = 1
+        for index, row in enumerate(text.fact_checker_table):
+            current_url = row[4]
+            last_url_row = False
+
+            # First row or URL change
+            if index == 0 or current_url != previous_url:
+                first_url_row = True
+                previous_url = current_url
+                # Increase counter "span_row" when the next url is the same
+                while (
+                    index + span_row < len(text.fact_checker_table)
+                    and text.fact_checker_table[index + span_row][4]
+                    == current_url
+                ):
+                    span_row += 1
+
+            else:
+                first_url_row = False
+                span_row -= 1
+
+            if span_row == 1:
+                last_url_row = True
+
+            formatted_row = format_text_fact_checker_row(
+                text,
+                row,
+                first_url_row,
+                last_url_row,
+                span_row,
+            )
+            rows.append(formatted_row)
 
     table = "\n".join(rows)
     return f"""
@@ -102,9 +108,9 @@ def create_fact_checker_table(
 def format_text_fact_checker_row(
     text: TextDetector,
     row: list,
-    first_url_row: bool=True,
-    last_url_row: bool=True,
-    span_row: int=1,
+    first_url_row: bool = True,
+    last_url_row: bool = True,
+    span_row: int = 1,
 ):
     entity_count = 0
     print(f"row: {row}")
@@ -158,10 +164,14 @@
     input_sentence = row[0]["input"]
     source_sentence = row[0]["source"]
 
+    input_sentence = replace_leading_spaces(input_sentence)
+    source_sentence = replace_leading_spaces(source_sentence)
+
     url = row[0]["url"]
 
     # Displayed label and score by url
     filterby_url = text.grouped_url_df[text.grouped_url_df["url"] == url]
+
     if len(filterby_url) > 0:
         label = filterby_url["label"].values[0]
         score = filterby_url["score"].values[0]
@@ -170,7 +180,10 @@
     score = text.prediction_score[0]
 
     # Format displayed url
-    source_text_url = f"""<a href="{url}">{url}</a>"""
+    if url is None:
+        source_text_url = url
+    else:
+        source_text_url = f"""<a href="{url}">{url}</a>"""
 
     # Format displayed entity count
     entity_count_text = format_entity_count(entity_count)
@@ -220,7 +233,10 @@
     """
 
 
-def format_image_fact_checker_row(image):
+def format_image_fact_checker_row(image: ImageDetector):
+    if image.input is None:
+        return ""
+
     if image.referent_url is not None or image.referent_url != "":
         source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
         source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
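
Note: the span_row bookkeeping above merges consecutive table rows that share a URL into a single rowspan cell. The same algorithm on a plain list of URLs, for illustration:

urls = ["a.com", "a.com", "b.com"]  # stand-in for column 4 of fact_checker_table

previous_url = None
span_row = 1
for index, current_url in enumerate(urls):
    last_url_row = False
    if index == 0 or current_url != previous_url:
        first_url_row = True
        previous_url = current_url
        # count ahead while the following rows keep the same URL
        while index + span_row < len(urls) and urls[index + span_row] == current_url:
            span_row += 1
    else:
        first_url_row = False
        span_row -= 1
    if span_row == 1:
        last_url_row = True
    print(index, first_url_row, last_url_row, span_row)
# 0 True False 2  -> first "a.com" row renders the merged cell (rowspan=2)
# 1 False True 1  -> second "a.com" row is the last of its group
# 2 True True 1   -> "b.com" stands alone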
src/application/formatting_governor.py CHANGED
@@ -7,7 +7,10 @@ from src.application.formatting import (
 )
 from src.application.image.image import ImageDetector
 from src.application.text.entity import apply_highlight
-from src.application.text.helper import extract_equal_text
+from src.application.text.helper import (
+    extract_equal_text,
+    replace_leading_spaces,
+)
 from src.application.text.text import TextDetector
 
 
@@ -17,32 +20,34 @@ def create_governor_table(
     image: ImageDetector,
 ):
     rows = []
-    rows.append(format_image_governor_row(image))
+    if image.input is not None:
+        rows.append(format_image_governor_row(image))
 
-    for _, row in aligned_sentences_df.iterrows():
-        if row["input"] is None:
-            continue
+    if text.input is not None:
+        for _, row in aligned_sentences_df.iterrows():
+            if row["input"] is None:
+                continue
 
-        if row["source"] is None:
-            equal_idx_1 = equal_idx_2 = []
-        else:
-            # Get index of equal phrases in input and source sentences
-            equal_idx_1, equal_idx_2 = extract_equal_text(
-                row["input"],
-                row["source"],
+            if row["source"] is None:
+                equal_idx_1 = equal_idx_2 = []
+            else:
+                # Get index of equal phrases in input and source sentences
+                equal_idx_1, equal_idx_2 = extract_equal_text(
+                    row["input"],
+                    row["source"],
+                )
+
+            text.governor_table.append(
+                [
+                    row,
+                    equal_idx_1,
+                    equal_idx_2,
+                    row["entities"],
+                ],
             )
 
-        text.governor_table.append(
-            [
-                row,
-                equal_idx_1,
-                equal_idx_2,
-                row["entities"],
-            ],
-        )
-
-        formatted_row = format_text_governor_row(text)
-        rows.append(formatted_row)
+        formatted_row = format_text_governor_row(text)
+        rows.append(formatted_row)
 
     table = "\n".join(rows)
     return f"""
@@ -123,9 +128,11 @@ def format_text_governor_row(text):
     source_sentence = row[0]["source"]
     input_sentence = row[0]["input"]
 
-    # convert score to HUMAN-based score:
-    input_sentences += input_sentence + "<br><br>"
-    source_sentences += source_sentence + "<br><br>"
+    input_sentence = replace_leading_spaces(input_sentence)
+    source_sentence = replace_leading_spaces(source_sentence)
+
+    input_sentences += input_sentence + "<br>"
+    source_sentences += source_sentence + "<br>"
 
     url = row[0]["url"]
     if url not in urls:
@@ -149,6 +156,9 @@
 
 
 def format_image_governor_row(image):
+    if image.input is None:
+        return ""
+
     if image.referent_url is not None or image.referent_url != "":
         source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
         source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
src/application/formatting_ordinary_user.py CHANGED
@@ -2,6 +2,7 @@ from pandas import DataFrame
 
 from src.application.config import WORD_BREAK
 from src.application.image.image import ImageDetector
+from src.application.text.helper import replace_leading_spaces
 from src.application.text.text import TextDetector
 
 
@@ -10,9 +11,26 @@ def create_ordinary_user_table(
     text: TextDetector,
     image: ImageDetector,
 ) -> str:
+    """
+    Creates an HTML table comparing input news with source news
+    for ordinary users.
+
+    Args:
+        aligned_sentences_df (DataFrame): Aligned sentence data.
+        text (TextDetector): Text comparison data.
+        image (ImageDetector): Image comparison data.
+
+    Returns:
+        A string representing the HTML table.
+    """
     rows = []
-    rows.append(format_image_ordinary_user_row(image))
-    rows.append(format_text_ordinary_user_row(aligned_sentences_df, text))
+
+    if image.input is not None:
+        rows.append(format_image_ordinary_user_row(image))
+
+    if text.input is not None:
+        rows.append(format_text_ordinary_user_row(aligned_sentences_df, text))
+
     table = "\n".join(rows)
 
     return f"""
@@ -32,8 +50,6 @@ def create_ordinary_user_table(
     {table}
     </tbody>
     </table>
-
-    <style>
     """
 
 
@@ -41,6 +57,16 @@ def format_text_ordinary_user_row(
     aligned_sentences_df,
     text,
 ) -> str:
+    """
+    Formats a row for the text in the ordinary user table.
+
+    Args:
+        aligned_sentences_df (DataFrame): Aligned sentence data.
+        text (TextDetector): Text comparison data.
+
+    Returns:
+        A string representing the HTML table row for the text.
+    """
     input_sentences = ""
     source_text_html = ""
     urls = []
@@ -48,7 +74,7 @@ def format_text_ordinary_user_row(
         if row["input"] is None:
             continue
 
-        input_sentences += row["input"] + "<br><br>"
+        input_sentences += replace_leading_spaces(row["input"]) + "<br>"
         url = row["url"]
         if url not in urls:
             urls.append(url)
@@ -66,15 +92,16 @@ def format_text_ordinary_user_row(
 
 def format_image_ordinary_user_row(image: ImageDetector) -> str:
     """
-    Formats an HTML table row for ordinary users,
-    displaying image analysis results.
+    Formats a row for the image in the ordinary user table.
 
     Args:
-        image (ImageDetector): The image to be analyzed.
+        image: Image comparison data.
 
     Returns:
-        str: An HTML table row string containing the image analysis results.
+        A string representing the HTML table row for the image.
     """
+    if image.input is None:
+        return ""
 
     # Put image, label, and score into html tag
     if image.referent_url is not None or image.referent_url != "":
src/application/image/helper.py ADDED
@@ -0,0 +1,12 @@
+import base64
+
+
+def encode_image(image_path):
+    with open(image_path, "rb") as img_file:
+        return base64.b64encode(img_file.read()).decode()
+
+
+image_base64 = encode_image(
+    "/content/ai-generated-picture-of-a-tiger-walking-in-the-forest-photo.jpg",
+)
+html_code = f'<img src="data:image/jpeg;base64,{image_base64}" width="300">'
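
Note: as committed, this helper runs at import time against a hard-coded Colab path (/content/...). A guarded variant, sketched here with a placeholder path, would keep the demo out of import side effects:

import base64


def encode_image(image_path: str) -> str:
    # Read an image file and return its base64-encoded contents.
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode()


if __name__ == "__main__":
    image_base64 = encode_image("path/to/image.jpg")  # placeholder path
    html_code = f'<img src="data:image/jpeg;base64,{image_base64}" width="300">'
    print(html_code[:60])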
src/application/image/image.py CHANGED
@@ -1,5 +1,6 @@
 class ImageDetector:
     def __init__(self):
+        self.input = None
         self.referent_url: str = None  # URL of the referenced image.
         self.prediction_label: str = None
         self.prediction_score: float = None
src/application/text/ai_classification.py ADDED
@@ -0,0 +1,86 @@
+from typing import (
+    Dict,
+    List,
+    Tuple,
+)
+
+import torch
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+)
+
+from src.application.config import AI_TEXT_CLASSIFICATION_MODEL
+
+
+def load_model_and_tokenizer(
+    model_path: str = AI_TEXT_CLASSIFICATION_MODEL,
+) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]:
+    """
+    Loads the trained model and tokenizer from the specified path.
+
+    Args:
+        model_path: path of directory containing the saved model and tokenizer.
+
+    Returns:
+        A tuple containing the loaded tokenizer and model.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForSequenceClassification.from_pretrained(model_path)
+    model.eval()
+    return tokenizer, model
+
+
+def predict(
+    texts: List[str],
+    model: AutoModelForSequenceClassification,
+    tokenizer: AutoTokenizer,
+) -> List[Dict[str, str]]:
+    """
+    Classifies input texts as gpt-4o or gpt-4o-mini.
+
+    Args:
+        texts: A list of input text strings to be classified.
+        model: The loaded model for sequence classification.
+        tokenizer: The loaded tokenizer.
+
+    Returns:
+        A list of dictionaries, where each dictionary contains the input text,
+        the predicted label, and the confidence score.
+    """
+    label_map = {0: "GPT-4o", 1: "GPT-4o mini"}
+    inputs = tokenizer(
+        texts,
+        padding="max_length",
+        truncation=True,
+        return_tensors="pt",
+    )
+    with torch.no_grad():
+        outputs = model(**inputs)
+    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+    confidence, predictions = torch.max(probabilities, dim=-1)
+
+    results = []
+    for text, pred, conf in zip(
+        texts,
+        predictions.tolist(),
+        confidence.tolist(),
+    ):
+        results.append(
+            {"input": text, "prediction": label_map[pred], "confidence": conf},
+        )
+
+    return results
+
+
+if __name__ == "__main__":
+    text = """The resignation brings a long political chapter to an end.
+    Trudeau has been in office since 2015, when he brought the Liberals back
+    to power from the political wilderness.
+    """
+
+    tokenizer, model = load_model_and_tokenizer("ductuan024/gpts-detector")
+    predictions = predict([text], model, tokenizer)  # predict() expects a list
+
+    print(predictions[0]["prediction"])
+    print(predictions[0]["confidence"])
src/application/text/helper.py CHANGED
@@ -15,6 +15,8 @@ from nltk.tokenize import (
 from nltk.util import ngrams
 from sklearn.feature_extraction.text import TfidfVectorizer
 
+from src.application.config import PREFIX
+
 
 def clean_text(text: str) -> str:
     """
@@ -122,9 +124,7 @@ def get_important_sentences(
         list: A list of important sentences.
     """
     # Clean and split the sentence into sentences
-    sentences = [
-        s.strip() for s in re.split(r"(?<=[.!?])\s+", sentence) if s.strip()
-    ]
+    sentences = [s for s in re.split(r"(?<=[.!?])\s+", sentence) if s]
 
     # Calculate the importance score for each sentence
     sentence_scores = []
@@ -293,13 +293,16 @@ def postprocess_label(labels: list[str]) -> str:
     Returns:
         A string with the formatted label.
     """
-    prefix = "Partially generated by "
+
     for index, label in enumerate(labels):
-        if label.startswith(prefix):
-            labels[index] = label[len(prefix) :]
+        # if label.startswith(PREFIX):
+        #     labels[index] = label[len(PREFIX) :]
+        if PREFIX in label:
+            labels[index] = label.replace(PREFIX, "")
 
     labels = list(set(labels))
-    label = prefix
+
+    label = ""
 
     if len(labels) == 1:
         label += labels[0]
@@ -362,7 +365,7 @@ def split_into_paragraphs(input_text: str) -> list[str]:
 
     for paragraph in paragraphs:
         # Remove leading/trailing whitespace
-        paragraph = paragraph.strip()
+        # paragraph = paragraph.strip()
 
         if paragraph and paragraph != "\n":
             # Append the cleaned paragraph to the output list.
@@ -460,6 +463,33 @@
     return filtered_starts, filtered_ends
 
 
+def replace_leading_spaces(text: str) -> str:
+    """
+    Replaces leading spaces in a string with '&nbsp;'.
+
+    Args:
+        text: The input string.
+
+    Returns:
+        The string with leading spaces replaced by '&nbsp;'.
+    """
+
+    if text is None:
+        return None
+
+    leading_spaces = 0
+    for char in text:
+        if char == " ":
+            leading_spaces += 1
+        else:
+            break
+
+    if leading_spaces > 0:
+        return "&nbsp;" * leading_spaces + text[leading_spaces:]
+    else:
+        return text
+
+
 def extract_new_startend(
     start: int,
     end: int,
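
Note: the point of routing prefix handling through PREFIX is that prefixed and bare labels now collapse to the same entry before the combined label is rebuilt. A minimal sketch with the config value inlined:

PREFIX = "Partially generated by "  # value defined in src/application/config.py

labels = ["Partially generated by GPT-4o", "GPT-4o"]
for index, label in enumerate(labels):
    if PREFIX in label:
        labels[index] = label.replace(PREFIX, "")
print(list(set(labels)))  # ['GPT-4o'] -- duplicates collapse after stripping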
src/application/text/model_detection.py CHANGED
@@ -15,8 +15,13 @@ from src.application.config import (
     HUMAN,
     MODEL_HUMAN_LABEL,
     PARAPHRASE_MODEL,
+    PREFIX,
     UNKNOWN,
 )
+from src.application.text.ai_classification import (
+    load_model_and_tokenizer,
+    predict,
+)
 
 
 def detect_text_by_ai_model(
@@ -63,7 +68,7 @@ def detect_text_by_ai_model(
     else:
         # label = MACHINE
         generated_model, _ = predict_generation_model(input_text)
-        label = f"Partially generated by {generated_model}"
+        label = f"{PREFIX}{generated_model}"
 
     return label, confidence_score
 
@@ -75,6 +80,24 @@ def detect_text_by_ai_model(
 def predict_generation_model(text: str) -> tuple[str, float]:
     """
     Predicts if text is generated by gpt-4o or gpt-4o-mini models.
+
+    Args:
+        text (str): The input text to be analyzed.
+
+    Returns:
+        tuple: (label, confidence_score)
+            where label is gpt-4o or gpt-4o-mini,
+            and confidence_score is the highest similarity.
+    """
+    tokenizer, model = load_model_and_tokenizer()
+    predictions = predict([text], model, tokenizer)  # predict() expects a list
+
+    return predictions[0]["prediction"], predictions[0]["confidence"]
+
+
+def predict_generation_model_by_reparaphrasing(text: str) -> tuple[str, float]:
+    """
+    Predicts if text is generated by gpt-4o or gpt-4o-mini models.
     Compares the input text against the paraphrased text by the models.
 
     Args:
@@ -82,8 +105,8 @@ def predict_generation_model(text: str) -> tuple[str, float]:
 
     Returns:
         tuple: (label, confidence_score)
-        where label is gpt-4o or gpt-4o-mini,
-        and confidence_score is the highest similarity.
+            where label is gpt-4o or gpt-4o-mini,
+            and confidence_score is the highest similarity.
     """
     best_similarity = 0
     best_model = GPT_PARAPHRASE_MODELS[0]
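
Note: predict_generation_model now delegates to the fine-tuned classifier instead of the re-paraphrasing comparison (kept above under a new name). End to end, the label is assembled like this (a sketch using the names defined in this commit; the model is downloaded on first use):

tokenizer, model = load_model_and_tokenizer()  # defaults to ductuan024/gpts-detector
predictions = predict(["some suspicious paragraph"], model, tokenizer)
label = f"{PREFIX}{predictions[0]['prediction']}"
print(label)  # e.g. "Partially generated by GPT-4o"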
src/application/text/search_detection.py CHANGED
@@ -3,8 +3,8 @@ Author: Khanh Phan
 Date: 2024-12-04
 """
 
-from typing import Optional
 import warnings
+from typing import Optional
 
 import numpy as np
 from pandas import DataFrame
@@ -14,6 +14,7 @@ from src.application.config import (
     DEVICE,
     MAX_CHAR_SIZE,
     PARAPHRASE_MODEL,
+    PARAPHRASE_THRESHOLD,
     PARAPHRASE_THRESHOLD_HUMAN,
     PARAPHRASE_THRESHOLD_MACHINE,
     TOP_URLS_PER_SEARCH,
@@ -96,15 +97,22 @@ def find_sentence_source(
         )
         return sentences_df, []
 
-    # assign values
-    columns = [
-        "input",
-        "source",
-        "label",
-        "similarity",
-        "paraphrase",
-        "url",
-    ]
+    if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD:
+        columns = [
+            "input",
+            "source",
+            "label",
+            "similarity",
+            "paraphrase",
+            "url",
+        ]
+    else:
+        columns = [
+            "input",
+            "label",
+            "paraphrase",
+        ]
+
     for c in columns:
         if c in sentences_df.columns:
             sentences_df.loc[text_index, c] = aligned_sentence[c]
@@ -126,13 +134,22 @@ def find_sentence_source(
                     similarity is None
                     or aligned_sentence["similarity"] > similarity
                 ):
-                    columns = [
-                        "input",
-                        "source",
-                        "label",
-                        "similarity",
-                        "url",
-                    ]
+                    if (
+                        aligned_sentence["similarity"]
+                        > PARAPHRASE_THRESHOLD
+                    ):
+                        columns = [
+                            "input",
+                            "source",
+                            "label",
+                            "similarity",
+                            "url",
+                        ]
+                    else:
+                        columns = [
+                            "input",
+                            "label",
+                        ]
                    for c in columns:
                         if c in sentences_df.columns:
                             sentences_df.loc[idx, c] = aligned_sentence[c]
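
Note: the effect of the new gate is that low-similarity matches keep their input/label/paraphrase values but no longer attach a source or URL. A toy illustration (the threshold value here is assumed; the real PARAPHRASE_THRESHOLD lives in config.py):

import pandas as pd

PARAPHRASE_THRESHOLD = 0.8  # assumed value for this sketch

sentences_df = pd.DataFrame(
    [{"input": None, "source": None, "label": None,
      "similarity": None, "paraphrase": None, "url": None}]
)
aligned_sentence = {"input": "s1", "source": "src", "label": "HUMAN",
                    "similarity": 0.5, "paraphrase": False, "url": "a.com"}

if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD:
    columns = ["input", "source", "label", "similarity", "paraphrase", "url"]
else:
    columns = ["input", "label", "paraphrase"]  # weak match: no source/url

for c in columns:
    sentences_df.loc[0, c] = aligned_sentence[c]
print(sentences_df[["input", "label", "url"]])  # url stays None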
src/application/text/text.py CHANGED
@@ -3,6 +3,8 @@ import pandas as pd
 
 class TextDetector:
     def __init__(self):
+        self.input = None
+
         self.prediction_label: list[str] = ["UNKNOWN"]
         self.prediction_score: list[float] = [0.0]
 
src/application/url_reader.py CHANGED
@@ -78,7 +78,7 @@ class URLReader:
 
         soup = BeautifulSoup(response.content, "html.parser")
 
-        self.title = soup.title.string.strip() if soup.title else None
+        self.title = soup.title.string if soup.title else None
 
         image_urls = [img["src"] for img in soup.find_all("img")]
         self.images = image_urls
test.py CHANGED
@@ -1,27 +1,39 @@
-def postprocess_label(labels: list[str]) -> str:
+def replace_leading_spaces(text):
     """
-    Creates a label string with the format
-    "Partially generated by [label1] and [label2] and ...".
-    Removes duplicate labels while preserving the original order.
+    Replaces leading spaces in a string with '&nbsp;'.
 
     Args:
-        labels: A list of strings representing labels.
+        text: The input string.
 
     Returns:
-        A string with the formatted label.
+        The string with leading spaces replaced by '&nbsp;'.
     """
-    labels = list(set(labels))
-    label = "Partially generated by "
-    if len(label) == 1:
-        label += labels[0]
-    elif len(labels) == 2:
-        label += f"{labels[0]} and {labels[1]}"
+
+    leading_spaces = 0
+    for char in text:
+        if char == " ":
+            leading_spaces += 1
+        else:
+            break
+
+    if leading_spaces > 0:
+        return "&nbsp;" * leading_spaces + text[leading_spaces:]
     else:
-        combination = ", ".join(labels[0 : len(labels) - 1])
-        label += f"{combination}, and {labels[-1]}"
-    return label
-
-
-labels = ["gpt-4o", "gpt-4o-mini", "gpt-4o-l"]
-postprocessed_label = postprocess_label(labels)
-print(postprocessed_label)
+        return text
+
+
+# Example usage:
+text1 = " Hello, world!"
+text2 = "No leading spaces."
+text3 = " Another example."
+text4 = "\t Test with tabs"  # this will not be replaced, only standard spaces
 
+result1 = replace_leading_spaces(text1)
+result2 = replace_leading_spaces(text2)
+result3 = replace_leading_spaces(text3)
+result4 = replace_leading_spaces(text4)
 
+print(f"'{text1}' becomes '{result1}'")
+print(f"'{text2}' becomes '{result2}'")
+print(f"'{text3}' becomes '{result3}'")
+print(f"'{text4}' becomes '{result4}'")