Commit 00b1038
Parent(s): e58707f
refactor code + fix bug of label after grouping url
Files changed:
- application_2.py +0 -0
- gpt_test.py +4 -3
- requirements.txt +2 -0
- src/application/config.py +6 -0
- src/application/content_detection.py +11 -4
- src/application/content_generation.py +1 -1
- src/application/formatting_fact_checker.py +77 -61
- src/application/formatting_governor.py +36 -26
- src/application/formatting_ordinary_user.py +36 -9
- src/application/image/helper.py +12 -0
- src/application/image/image.py +1 -0
- src/application/text/ai_classification.py +86 -0
- src/application/text/helper.py +38 -8
- src/application/text/model_detection.py +26 -3
- src/application/text/search_detection.py +34 -17
- src/application/text/text.py +2 -0
- src/application/url_reader.py +1 -1
- test.py +30 -18
application_2.py
DELETED
File without changes
gpt_test.py
CHANGED
@@ -91,9 +91,10 @@ AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
 AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
 
 azure_client = AzureOpenAI(
-    azure_endpoint=…
+    azure_endpoint=AZURE_OPENAI_ENDPOINT,
     api_key=AZURE_OPENAI_API_KEY,
-    api_version=…
+    api_version=AZURE_OPENAI_API_VERSION,
+    # API VERSION=[ 2024-12-01-preview, 2024-05-01-preview]
 )
 
 deplopment_name = "gpt-4o"  # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"

@@ -127,4 +128,4 @@ for index, news in enumerate(text):
     count += 1
     paraphrased_news = response.choices[0].message.content
 
-    add_text_to_csv("data/…
+    add_text_to_csv("data/test.csv", paraphrased_news, count)
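The fix above completes the client construction from environment variables. For context, a minimal sketch of how such a client is typically driven with the script's deployment name; the prompt content here is an assumption, not part of the commit:

    import os

    from openai import AzureOpenAI

    azure_client = AzureOpenAI(
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    )

    # "gpt-4o" is the Azure deployment name used by the script.
    response = azure_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "Paraphrase: ..."}],  # assumed prompt
    )
    print(response.choices[0].message.content)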
requirements.txt
CHANGED
@@ -17,8 +17,10 @@ scikit-learn
 nltk
 numpy
 torch
+tokenizers
 sentence-transformers
 accelerate
+sentencepiece
 
 # Images
 pillow==10.1.0
src/application/config.py
CHANGED
@@ -43,6 +43,9 @@ PARAPHRASE_MODEL.to(DEVICE)
 # Model to detect AI-generated text
 AI_TEXT_DECTECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
 
+# Model to classify AI-generated text
+AI_TEXT_CLASSIFICATION_MODEL = "ductuan024/gpts-detector"
+
 # Thresholds
 PARAPHRASE_THRESHOLD_HUMAN = 0.963
 PARAPHRASE_THRESHOLD_MACHINE = 0.8

@@ -89,3 +92,6 @@ ENTITY_BRIGHTNESS = 0.75 # color's brightness.
 
 # HTML formatting
 WORD_BREAK = "word-break: break-all;"
+
+# Prefix for output MACHINE label of text
+PREFIX = "Partially generated by "
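The new PREFIX constant is consumed by model_detection.py later in this commit; a one-line sketch of how it composes with a classifier label:

    PREFIX = "Partially generated by "

    generated_model = "GPT-4o"  # example label from the new classifier
    label = f"{PREFIX}{generated_model}"
    print(label)  # Partially generated by GPT-4o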
src/application/content_detection.py
CHANGED
@@ -74,8 +74,7 @@ class NewsVerification:
             news_image (str): The url of image in news article.
         """
         # Combine title and content for a full text representation.
-
-        self.news_text = (news_title + "\n\n" + news_content).strip()
+        self.news_text = news_title + "\n\n" + news_content
 
         # if not isinstance(news_title, str) or not isinstance(
         #     news_content,

@@ -90,6 +89,9 @@ class NewsVerification:
         self.news_content = news_content
         self.news_image = news_image
 
+        self.text.input = self.news_text
+        self.image.input = news_image
+
     def group_by_url(self):
         """
         Groups aligned sentences by URL

@@ -138,6 +140,7 @@ class NewsVerification:
             # Detect text origin using an AI model.
            label, score = detect_text_by_ai_model(text)
 
+            print(f"labels = {label}")
             self.text.grouped_url_df.at[index, "label"] = label
             self.text.grouped_url_df.at[index, "score"] = score
 

@@ -169,6 +172,7 @@ class NewsVerification:
                     na=False,
                 )
             ]
+            print(f" machine_label = {machine_label}")
 
             if not machine_label.empty:
                 # If 'gpt-4o' labels are found, post-process and assign.

@@ -185,7 +189,9 @@ class NewsVerification:
                     self.aligned_sentences_df["label"] == "HUMAN"
                 ]
                 self.text.prediction_label[0] = "HUMAN"
-                self.text.prediction_score[0] = …
+                self.text.prediction_score[0] = self.text.grouped_url_df[
+                    "score"
+                ].mean()
         else:
             # If no found URLs, use AI detection on the entire input text.
             print("No source found in the input text")

@@ -306,7 +312,7 @@ class NewsVerification:
 
        # Check if a ratio of remaining filtering-sentences is more than 50%.
        if (
-            len(filtered_by_similarity) / len(…
+            len(filtered_by_similarity) / len(filtered_by_url)
            > MIN_RATIO_PARAPHRASE_NUM
        ):
            # check if "MACHINE" is in self.aligned_sentences_df["label"]:

@@ -319,6 +325,7 @@ class NewsVerification:
                )
                .any()
            )
+            print(f"contain_machine = \n{contains_machine}")
 
            # TODO: integrate with determine_text_origin
            if contains_machine:
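The label bug named in the commit message is the truncated score assignment above: after grouping by URL, the HUMAN branch now averages the per-URL scores. A minimal pandas sketch of that aggregation, with invented data:

    import pandas as pd

    # Invented stand-in for self.text.grouped_url_df after grouping by URL.
    grouped_url_df = pd.DataFrame(
        {
            "url": ["https://a.example", "https://b.example"],
            "label": ["HUMAN", "HUMAN"],
            "score": [0.91, 0.87],
        }
    )

    # When all grouped URLs are labeled HUMAN, the commit assigns the mean score.
    prediction_score = grouped_url_df["score"].mean()
    print(prediction_score)  # 0.89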
src/application/content_generation.py
CHANGED
@@ -86,7 +86,7 @@ def extract_title_content(fake_news: str) -> tuple[str, str]:
     title_start = fake_news.find("# Title: ") + len("# Title: ")
     title_end = fake_news.find("\n", title_start)
     if title_start != -1 and title_end != -1:
-        title = fake_news[title_start:title_end].strip()
+        title = fake_news[title_start:title_end]  # .strip()
 
     title_start = fake_news.find("\n# Content: ") + len(
         "\n# Content: ",
src/application/formatting_fact_checker.py
CHANGED
@@ -7,7 +7,10 @@ from src.application.formatting import (
 )
 from src.application.image.image import ImageDetector
 from src.application.text.entity import apply_highlight
-from src.application.text.helper import …
+from src.application.text.helper import (
+    extract_equal_text,
+    replace_leading_spaces,
+)
 from src.application.text.text import TextDetector
 
 

@@ -17,63 +20,66 @@ def create_fact_checker_table(
     image: ImageDetector,
 ):
     rows = []
-    …
-        formatted_row = format_text_fact_checker_row(
-            text,
-            row,
-            first_url_row,
-            last_url_row,
-            span_row,
-        )
-        rows.append(formatted_row)
+    if image.input is not None:
+        rows.append(format_image_fact_checker_row(image))
+
+    if text.input is not None:
+        for _, row in aligned_sentences_df.iterrows():
+            if row["input"] is None:
+                continue
+
+            if row["source"] is None:
+                equal_idx_1 = equal_idx_2 = []
+
+            else:  # Get index of equal phrases in input and source sentences
+                equal_idx_1, equal_idx_2 = extract_equal_text(
+                    row["input"],
+                    row["source"],
+                )
+
+            text.fact_checker_table.append(
+                [
+                    row,  # aligned_sentences_df
+                    equal_idx_1,  # index of equal text in input
+                    equal_idx_2,  # index of equal text in source
+                    row["entities"],
+                    row["url"],
+                ],
+            )
+
+        previous_url = None
+        span_row = 1
+        for index, row in enumerate(text.fact_checker_table):
+            current_url = row[4]
+            last_url_row = False
+
+            # First row or URL change
+            if index == 0 or current_url != previous_url:
+                first_url_row = True
+                previous_url = current_url
+                # Increase counter "span_row" when the next url is the same
+                while (
+                    index + span_row < len(text.fact_checker_table)
+                    and text.fact_checker_table[index + span_row][4]
+                    == current_url
+                ):
+                    span_row += 1
+
+            else:
+                first_url_row = False
+                span_row -= 1
+
+            if span_row == 1:
+                last_url_row = True
+
+            formatted_row = format_text_fact_checker_row(
+                text,
+                row,
+                first_url_row,
+                last_url_row,
+                span_row,
+            )
+            rows.append(formatted_row)
 
     table = "\n".join(rows)
     return f"""

@@ -102,9 +108,9 @@ def create_fact_checker_table(
 def format_text_fact_checker_row(
     text: TextDetector,
     row: list,
-    first_url_row: bool=True,
-    last_url_row: bool=True,
-    span_row: int=1,
+    first_url_row: bool = True,
+    last_url_row: bool = True,
+    span_row: int = 1,
 ):
     entity_count = 0
     print(f"row: {row}")

@@ -158,10 +164,14 @@ def format_text_fact_checker_row(
     input_sentence = row[0]["input"]
     source_sentence = row[0]["source"]
 
+    input_sentence = replace_leading_spaces(input_sentence)
+    source_sentence = replace_leading_spaces(source_sentence)
+
     url = row[0]["url"]
 
     # Displayed label and score by url
     filterby_url = text.grouped_url_df[text.grouped_url_df["url"] == url]
+
     if len(filterby_url) > 0:
         label = filterby_url["label"].values[0]
         score = filterby_url["score"].values[0]

@@ -170,7 +180,10 @@ def format_text_fact_checker_row(
         score = text.prediction_score[0]
 
     # Format displayed url
-    …
+    if url is None:
+        source_text_url = url
+    else:
+        source_text_url = f"""<a href="{url}">{url}</a>"""
 
     # Format displayed entity count
     entity_count_text = format_entity_count(entity_count)

@@ -220,7 +233,10 @@ def format_text_fact_checker_row(
     """
 
 
-def format_image_fact_checker_row(image):
+def format_image_fact_checker_row(image: ImageDetector):
+    if image.input is None:
+        return ""
+
     if image.referent_url is not None or image.referent_url != "":
         source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
         source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
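The new span_row bookkeeping computes HTML rowspan counts for consecutive table rows that share a URL. A standalone sketch of the same loop over bare (sentence, url) pairs, to make the counting visible:

    # Standalone sketch of the rowspan bookkeeping in create_fact_checker_table,
    # using bare (sentence, url) pairs instead of the real table rows.
    rows = [
        ("s1", "https://a.example"),
        ("s2", "https://a.example"),
        ("s3", "https://b.example"),
    ]

    previous_url = None
    span_row = 1
    for index, (sentence, url) in enumerate(rows):
        last_url_row = False
        if index == 0 or url != previous_url:
            first_url_row = True
            previous_url = url
            # Count how many upcoming rows share this URL.
            while index + span_row < len(rows) and rows[index + span_row][1] == url:
                span_row += 1
        else:
            first_url_row = False
            span_row -= 1
        if span_row == 1:
            last_url_row = True
        print(index, first_url_row, last_url_row, span_row)
    # Row 0 opens a rowspan of 2, row 1 closes it, row 2 is a span of 1.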
src/application/formatting_governor.py
CHANGED
@@ -7,7 +7,10 @@ from src.application.formatting import (
 )
 from src.application.image.image import ImageDetector
 from src.application.text.entity import apply_highlight
-from src.application.text.helper import …
+from src.application.text.helper import (
+    extract_equal_text,
+    replace_leading_spaces,
+)
 from src.application.text.text import TextDetector
 
 

@@ -17,32 +20,34 @@ def create_governor_table(
     image: ImageDetector,
 ):
     rows = []
-    …
-        text.governor_table.append(
-            [
-                row,
-                equal_idx_1,
-                equal_idx_2,
-                row["entities"],
-            ],
-        )
-
-        formatted_row = format_text_governor_row(text)
-        rows.append(formatted_row)
+    if image.input is not None:
+        rows.append(format_image_governor_row(image))
+
+    if text.input is not None:
+        for _, row in aligned_sentences_df.iterrows():
+            if row["input"] is None:
+                continue
+
+            if row["source"] is None:
+                equal_idx_1 = equal_idx_2 = []
+            else:
+                # Get index of equal phrases in input and source sentences
+                equal_idx_1, equal_idx_2 = extract_equal_text(
+                    row["input"],
+                    row["source"],
+                )
+
+            text.governor_table.append(
+                [
+                    row,
+                    equal_idx_1,
+                    equal_idx_2,
+                    row["entities"],
+                ],
+            )
+
+        formatted_row = format_text_governor_row(text)
+        rows.append(formatted_row)
 
     table = "\n".join(rows)
     return f"""

@@ -123,9 +128,11 @@ def format_text_governor_row(text):
         source_sentence = row[0]["source"]
         input_sentence = row[0]["input"]
 
-        …
+        input_sentence = replace_leading_spaces(input_sentence)
+        source_sentence = replace_leading_spaces(source_sentence)
+
+        input_sentences += input_sentence + "<br>"
+        source_sentences += source_sentence + "<br>"
 
         url = row[0]["url"]
         if url not in urls:

@@ -149,6 +156,9 @@ def format_text_governor_row(text):
 
 
 def format_image_governor_row(image):
+    if image.input is None:
+        return ""
+
     if image.referent_url is not None or image.referent_url != "":
         source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
         source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
src/application/formatting_ordinary_user.py
CHANGED
@@ -2,6 +2,7 @@ from pandas import DataFrame
 
 from src.application.config import WORD_BREAK
 from src.application.image.image import ImageDetector
+from src.application.text.helper import replace_leading_spaces
 from src.application.text.text import TextDetector
 
 

@@ -10,9 +11,26 @@ def create_ordinary_user_table(
     text: TextDetector,
     image: ImageDetector,
 ) -> str:
+    """
+    Creates an HTML table comparing input news with source news
+    for ordinary users.
+
+    Args:
+        aligned_sentences_df (DataFrame): Aligned sentence data.
+        text (TextDetector): Text comparison data.
+        image (ImageDetector): Image comparison data.
+
+    Returns:
+        A string representing the HTML table.
+    """
     rows = []
-    …
+
+    if image.input is not None:
+        rows.append(format_image_ordinary_user_row(image))
+
+    if text.input is not None:
+        rows.append(format_text_ordinary_user_row(aligned_sentences_df, text))
+
     table = "\n".join(rows)
 
     return f"""

@@ -32,8 +50,6 @@ def create_ordinary_user_table(
     {table}
     </tbody>
     </table>
-
-    <style>
     """
 
 

@@ -41,6 +57,16 @@ def format_text_ordinary_user_row(
     aligned_sentences_df,
     text,
 ) -> str:
+    """
+    Formats a row for the text in the ordinary user table.
+
+    Args:
+        aligned_sentences_df (DataFrame): Aligned sentence data.
+        text (TextDetector): Text comparison data.
+
+    Returns:
+        A string representing the HTML table row for the text.
+    """
     input_sentences = ""
     source_text_html = ""
     urls = []

@@ -48,7 +74,7 @@ def format_text_ordinary_user_row(
         if row["input"] is None:
             continue
 
-        input_sentences += row["input"] + "<br>"
+        input_sentences += replace_leading_spaces(row["input"]) + "<br>"
         url = row["url"]
         if url not in urls:
             urls.append(url)

@@ -66,15 +92,16 @@ def format_text_ordinary_user_row(
 
 def format_image_ordinary_user_row(image: ImageDetector) -> str:
     """
-    Formats …
-    displaying image analysis results.
+    Formats a row for the image in the ordinary user table.
 
     Args:
-        image…
+        image: Image comparison data.
 
     Returns:
-        …
+        A string representing the HTML table row for the image.
     """
+    if image.input is None:
+        return ""
 
     # Put image, label, and score into html tag
     if image.referent_url is not None or image.referent_url != "":
src/application/image/helper.py
ADDED
@@ -0,0 +1,12 @@
+import base64
+
+
+def encode_image(image_path):
+    with open(image_path, "rb") as img_file:
+        return base64.b64encode(img_file.read()).decode()
+
+
+image_base64 = encode_image(
+    "/content/ai-generated-picture-of-a-tiger-walking-in-the-forest-photo.jpg",
+)
+html_code = f'<img src="data:image/jpeg;base64,{image_base64}" width="300">'
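As committed, the module encodes a hard-coded Colab path at import time. A sketch of the same helper behind a main-guard; the local file name is an assumption:

    import base64


    def encode_image(image_path):
        # Read the file and return its base64 text for an inline data: URI.
        with open(image_path, "rb") as img_file:
            return base64.b64encode(img_file.read()).decode()


    if __name__ == "__main__":
        image_base64 = encode_image("sample.jpg")  # assumed local file
        html_code = f'<img src="data:image/jpeg;base64,{image_base64}" width="300">'
        print(html_code[:60])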
src/application/image/image.py
CHANGED
@@ -1,5 +1,6 @@
 class ImageDetector:
     def __init__(self):
+        self.input = None
         self.referent_url: str = None  # URL of the referenced image.
         self.prediction_label: str = None
         self.prediction_score: float = None
src/application/text/ai_classification.py
ADDED
@@ -0,0 +1,86 @@
+from typing import (
+    Dict,
+    List,
+    Tuple,
+)
+
+import torch
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+)
+
+from src.application.config import AI_TEXT_CLASSIFICATION_MODEL
+
+
+def load_model_and_tokenizer(
+    model_path: str = AI_TEXT_CLASSIFICATION_MODEL,
+) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]:
+    """
+    Loads the trained model and tokenizer from the specified path.
+
+    Args:
+        model_path: path of directory containing the saved model and tokenizer.
+
+    Returns:
+        A tuple containing the loaded tokenizer and model.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForSequenceClassification.from_pretrained(model_path)
+    model.eval()
+    return tokenizer, model
+
+
+def predict(
+    texts: List[str],
+    model: AutoModelForSequenceClassification,
+    tokenizer: AutoTokenizer,
+) -> List[Dict[str, str]]:
+    """
+    Classify on input texts into gpt-4o or gpt-4o-mini.
+
+    Args:
+        texts: A list of input text strings to be classified.
+        model: The loaded model for sequence classification.
+        tokenizer: The loaded tokenizer.
+
+    Returns:
+        A list of dictionaries, where each dictionary contains the input text,
+        the predicted label, and the confidence score.
+    """
+    label_map = {0: "GPT-4o", 1: "GPT-4o mini"}
+    inputs = tokenizer(
+        texts,
+        padding="max_length",
+        truncation=True,
+        return_tensors="pt",
+    )
+    with torch.no_grad():
+        outputs = model(**inputs)
+        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+        confidence, predictions = torch.max(probabilities, dim=-1)
+
+    results = []
+    for text, pred, conf in zip(
+        texts,
+        predictions.tolist(),
+        confidence.tolist(),
+    ):
+        results.append(
+            {"input": text, "prediction": label_map[pred], "confidence": conf},
+        )
+
+    return results
+
+
+if __name__ == "__main__":
+    text = """The resignation brings a long political chapter to an end.
+    Trudeau has been in office since 2015, when he brought the Liberals back
+    to power from the political wilderness.
+    """
+
+    tokenizer, model = load_model_and_tokenizer("ductuan024/gpts-detector")
+    predictions = predict(text, model, tokenizer)
+
+    print(predictions[0]["prediction"])
+    print(predictions[0]["confidence"])
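Note that predict is typed for a list of strings, while the __main__ block passes a bare string; the tokenizer still batches it, but the zip then iterates the string's characters, so the returned "input" field holds a single character. A sketch of a call with a one-element list, which keeps the field intact:

    from src.application.text.ai_classification import (
        load_model_and_tokenizer,
        predict,
    )

    tokenizer, model = load_model_and_tokenizer()
    # Wrap the text in a list so results[0]["input"] is the full string.
    results = predict(
        ["The resignation brings a long political chapter to an end."],
        model,
        tokenizer,
    )
    print(results[0]["prediction"], results[0]["confidence"])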
src/application/text/helper.py
CHANGED
@@ -15,6 +15,8 @@ from nltk.tokenize import (
 from nltk.util import ngrams
 from sklearn.feature_extraction.text import TfidfVectorizer
 
+from src.application.config import PREFIX
+
 
 def clean_text(text: str) -> str:
     """

@@ -122,9 +124,7 @@ def get_important_sentences(
         list: A list of important sentences.
     """
     # Clean and split the sentence into sentences
-    sentences = [
-        s.strip() for s in re.split(r"(?<=[.!?])\s+", sentence) if s.strip()
-    ]
+    sentences = [s for s in re.split(r"(?<=[.!?])\s+", sentence) if s]
 
     # Calculate the importance score for each sentence
     sentence_scores = []

@@ -293,13 +293,16 @@ def postprocess_label(labels: list[str]) -> str:
     Returns:
         A string with the formatted label.
     """
-    …
+
     for index, label in enumerate(labels):
-        if label.startswith(PREFIX):
-            labels[index] = label[len(PREFIX) :]
+        # if label.startswith(PREFIX):
+        #     labels[index] = label[len(PREFIX) :]
+        if PREFIX in label:
+            labels[index] = label.replace(PREFIX, "")
 
     labels = list(set(labels))
-    …
+
+    label = ""
 
     if len(labels) == 1:
         label += labels[0]

@@ -362,7 +365,7 @@ def split_into_paragraphs(input_text: str) -> list[str]:
 
     for paragraph in paragraphs:
         # Remove leading/trailing whitespace
-        paragraph = paragraph.strip()
+        # paragraph = paragraph.strip()
 
         if paragraph and paragraph != "\n":
             # Append the cleaned paragraph to the output list.

@@ -460,6 +463,33 @@ def filter_indices(
     return filtered_starts, filtered_ends
 
 
+def replace_leading_spaces(text: str) -> str:
+    """
+    Replaces leading spaces in a string with '&nbsp;'.
+
+    Args:
+        text: The input string.
+
+    Returns:
+        The string with leading spaces replaced by '&nbsp;'.
+    """
+
+    if text is None:
+        return None
+
+    leading_spaces = 0
+    for char in text:
+        if char == " ":
+            leading_spaces += 1
+        else:
+            break
+
+    if leading_spaces > 0:
+        return "&nbsp;" * leading_spaces + text[leading_spaces:]
+    else:
+        return text
+
+
 def extract_new_startend(
     start: int,
     end: int,
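postprocess_label now strips PREFIX from each label before deduplicating, which is what keeps "Partially generated by " from appearing twice in a combined label. The stripping and dedup step in isolation, with invented labels:

    PREFIX = "Partially generated by "

    labels = ["Partially generated by GPT-4o", "Partially generated by GPT-4o"]
    labels = [lb.replace(PREFIX, "") if PREFIX in lb else lb for lb in labels]
    labels = list(set(labels))
    print(labels)  # ['GPT-4o'] -> one label left to re-prefix once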
src/application/text/model_detection.py
CHANGED
@@ -15,8 +15,13 @@ from src.application.config import (
     HUMAN,
     MODEL_HUMAN_LABEL,
     PARAPHRASE_MODEL,
+    PREFIX,
     UNKNOWN,
 )
+from src.application.text.ai_classification import (
+    load_model_and_tokenizer,
+    predict,
+)
 
 
 def detect_text_by_ai_model(

@@ -63,7 +68,7 @@ def detect_text_by_ai_model(
     else:
         # label = MACHINE
         generated_model, _ = predict_generation_model(input_text)
-        label = f"…
+        label = f"{PREFIX}{generated_model}"
 
     return label, confidence_score
 

@@ -75,6 +80,24 @@ def detect_text_by_ai_model(
 def predict_generation_model(text: str) -> tuple[str, float]:
     """
     Predicts if text is generated by gpt-4o or gpt-4o-mini models.
+
+    Args:
+        text (str): The input text to be analyzed.
+
+    Returns:
+        tuple: (label, confidence_score)
+            where label is gpt-4o or gpt-4o-mini,
+            and confidence_score is the highest similarity.
+    """
+    tokenizer, model = load_model_and_tokenizer()
+    predictions = predict(text, model, tokenizer)
+
+    return predictions[0]["prediction"], predictions[0]["confidence"]
+
+
+def predict_generation_model_by_reparaphrasing(text: str) -> tuple[str, float]:
+    """
+    Predicts if text is generated by gpt-4o or gpt-4o-mini models.
     Compares the input text against the paraphrased text by the models.
 
     Args:

@@ -82,8 +105,8 @@ def predict_generation_model(text: str) -> tuple[str, float]:
 
     Returns:
         tuple: (label, confidence_score)
-        …
+            where label is gpt-4o or gpt-4o-mini,
+            and confidence_score is the highest similarity.
     """
     best_similarity = 0
     best_model = GPT_PARAPHRASE_MODELS[0]
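predict_generation_model as committed calls load_model_and_tokenizer() on every invocation, re-reading the checkpoint each time. One possible way to cache the load; the lru_cache wrapper is an assumption, not part of the commit:

    # Sketch: cache the classifier load across calls (assumed refactor).
    from functools import lru_cache

    from src.application.text.ai_classification import (
        load_model_and_tokenizer,
        predict,
    )


    @lru_cache(maxsize=1)
    def cached_classifier():
        # Loaded once, reused on every subsequent call.
        return load_model_and_tokenizer()


    def predict_generation_model(text: str) -> tuple[str, float]:
        tokenizer, model = cached_classifier()
        predictions = predict([text], model, tokenizer)
        return predictions[0]["prediction"], predictions[0]["confidence"]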
src/application/text/search_detection.py
CHANGED
@@ -3,8 +3,8 @@ Author: Khanh Phan
 Date: 2024-12-04
 """
 
-from typing import Optional
 import warnings
+from typing import Optional
 
 import numpy as np
 from pandas import DataFrame

@@ -14,6 +14,7 @@ from src.application.config import (
     DEVICE,
     MAX_CHAR_SIZE,
     PARAPHRASE_MODEL,
+    PARAPHRASE_THRESHOLD,
     PARAPHRASE_THRESHOLD_HUMAN,
     PARAPHRASE_THRESHOLD_MACHINE,
     TOP_URLS_PER_SEARCH,

@@ -96,15 +97,22 @@ def find_sentence_source(
             )
             return sentences_df, []
 
-        …
+        if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD:
+            columns = [
+                "input",
+                "source",
+                "label",
+                "similarity",
+                "paraphrase",
+                "url",
+            ]
+        else:
+            columns = [
+                "input",
+                "label",
+                "paraphrase",
+            ]
+
         for c in columns:
             if c in sentences_df.columns:
                 sentences_df.loc[text_index, c] = aligned_sentence[c]

@@ -126,13 +134,22 @@ def find_sentence_source(
                     similarity is None
                     or aligned_sentence["similarity"] > similarity
                 ):
-                    …
+                    if (
+                        aligned_sentence["similarity"]
+                        > PARAPHRASE_THRESHOLD
+                    ):
+                        columns = [
+                            "input",
+                            "source",
+                            "label",
+                            "similarity",
+                            "url",
+                        ]
+                    else:
+                        columns = [
+                            "input",
+                            "label",
+                        ]
                     for c in columns:
                         if c in sentences_df.columns:
                             sentences_df.loc[idx, c] = aligned_sentence[c]
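The two new branches duplicate the column lists, differing only in whether "paraphrase" is kept. A hypothetical helper both call sites could share; the function name and the keep_paraphrase flag are assumptions, not part of the commit:

    # Assumed refactor: one place to pick the columns to copy per similarity.
    def columns_for_similarity(
        similarity: float,
        threshold: float,
        keep_paraphrase: bool,
    ) -> list[str]:
        if similarity > threshold:
            columns = ["input", "source", "label", "similarity", "url"]
            if keep_paraphrase:
                columns.insert(4, "paraphrase")
            return columns
        columns = ["input", "label"]
        if keep_paraphrase:
            columns.append("paraphrase")
        return columns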
src/application/text/text.py
CHANGED
@@ -3,6 +3,8 @@ import pandas as pd
 
 class TextDetector:
     def __init__(self):
+        self.input = None
+
         self.prediction_label: list[str] = ["UNKNOWN"]
         self.prediction_score: list[float] = [0.0]
 
src/application/url_reader.py
CHANGED
@@ -78,7 +78,7 @@ class URLReader:
 
         soup = BeautifulSoup(response.content, "html.parser")
 
-        self.title = soup.title.string
+        self.title = soup.title.string if soup.title else None
 
         image_urls = [img["src"] for img in soup.find_all("img")]
         self.images = image_urls
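The guard fixes an AttributeError on pages with no <title> element, where soup.title is None. A minimal reproduction:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<html><body>no title here</body></html>", "html.parser")

    # soup.title is None here, so soup.title.string would raise AttributeError.
    title = soup.title.string if soup.title else None
    print(title)  # None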
test.py
CHANGED
@@ -1,27 +1,39 @@
-def …
+def replace_leading_spaces(text):
     """
-    …
-    "Partially generated by [label1] and [label2] and ...".
-    Removes duplicate labels while preserving the original order.
+    Replaces leading spaces in a string with '&nbsp;'.
 
     Args:
-        …
+        text: The input string.
 
     Returns:
-        …
+        The string with leading spaces replaced by '&nbsp;'.
     """
-    …
+
+    leading_spaces = 0
+    for char in text:
+        if char == " ":
+            leading_spaces += 1
+        else:
+            break
+
+    if leading_spaces > 0:
+        return "&nbsp;" * leading_spaces + text[leading_spaces:]
     else:
-        …
-    print(…
+        return text
+
+
+# Example usage:
+text1 = " Hello, world!"
+text2 = "No leading spaces."
+text3 = " Another example."
+text4 = "\t Test with tabs"  # this will not be replaced, only standard spaces
 
+result1 = replace_leading_spaces(text1)
+result2 = replace_leading_spaces(text2)
+result3 = replace_leading_spaces(text3)
+result4 = replace_leading_spaces(text4)
 
+print(f"'{text1}' becomes '{result1}'")
+print(f"'{text2}' becomes '{result2}'")
+print(f"'{text3}' becomes '{result3}'")
+print(f"'{text4}' becomes '{result4}'")