Commit a5e8d12 · Parent: 0260491
Separate each row by paragraph

Files changed:
- gpt_test.py +3 -3
- src/application/content_detection.py +46 -51
- src/application/text/entity.py +3 -3
- src/application/text/helper.py +9 -9
- src/application/text/highlight_text.py +5 -5
- src/application/text/preprocessing.py +24 -1
- src/application/text/search_detection.py +34 -26
- test.py +71 -11
gpt_test.py  CHANGED

@@ -96,12 +96,12 @@ azure_client = AzureOpenAI(
     api_version="2024-05-01-preview",
 )
 
-deplopment_name = "gpt-4o-mini"  # "o1-mini" # or "gpt-4o"
+deplopment_name = "gpt-4o"  # or "gpt-4o-mini" # "o1-mini" # or "gpt-4o"
 TEXT_PROMPT = """
 Paraphrase the following news, only output the paraphrased text:
 
 """
-text = get_first_column("data/
+text = get_first_column("data/MAGE_2.csv")
 count = 0
 for index, news in enumerate(text):
     if count > 1000:
@@ -127,4 +127,4 @@ for index, news in enumerate(text):
     count += 1
     paraphrased_news = response.choices[0].message.content
 
-add_text_to_csv("data/
+add_text_to_csv("data/MAGE_2_4o.csv", paraphrased_news, count)
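Note: the chat-completion call itself falls outside these hunks. Below is a minimal sketch of how deplopment_name and TEXT_PROMPT are presumably wired up, assuming the standard AzureOpenAI chat API; the message format, endpoint, and key placeholders are assumptions, not shown in the diff:

    from openai import AzureOpenAI

    azure_client = AzureOpenAI(
        azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
        api_key="<your-key>",                                        # placeholder
        api_version="2024-05-01-preview",
    )
    deplopment_name = "gpt-4o"  # identifier kept as committed, typo included
    TEXT_PROMPT = "Paraphrase the following news, only output the paraphrased text:\n\n"
    news = "Example news text."  # stand-in for one row from get_first_column

    # Hypothetical reconstruction of the elided call in the loop body.
    response = azure_client.chat.completions.create(
        model=deplopment_name,
        messages=[{"role": "user", "content": TEXT_PROMPT + news}],
    )
    print(response.choices[0].message.content)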
src/application/content_detection.py  CHANGED

@@ -16,10 +16,10 @@ from src.application.text.model_detection import (
     detect_text_by_ai_model,
     predict_generation_model,
 )
-from src.application.text.preprocessing import split_into_paragraphs
+from src.application.text.preprocessing import split_into_paragraphs, split_into_sentences
 from src.application.text.search_detection import (
     PARAPHRASE_THRESHOLD_MACHINE,
-    find_paragraph_source,
+    find_sentence_source,
 )
 
 
@@ -44,7 +44,7 @@ class NewsVerification:
         self.found_img_url: list[str] = []
 
         # Analyzed results
-        self.aligned_paragraphs_df: pd.DataFrame = pd.DataFrame(
+        self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
             columns=[
                 "input",
                 "source",
@@ -78,7 +78,7 @@ class NewsVerification:
                 series.astype(str).tolist(),
             )  # Handle mixed data types and NaNs
 
-        self.grouped_url_df = self.aligned_paragraphs_df.groupby("url").agg(
+        self.grouped_url_df = self.aligned_sentences_df.groupby("url").agg(
             {
                 "input": concat_text,
                 "source": concat_text,
@@ -89,7 +89,7 @@ class NewsVerification:
         self.grouped_url_df["label"] = None
         self.grouped_url_df["score"] = None
 
-        print(f"aligned_paragraphs_df:\n {self.aligned_paragraphs_df}")
+        print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")
 
         for index, row in self.grouped_url_df.iterrows():
             label, score = self.verify_text(row["url"])
@@ -112,22 +112,20 @@ class NewsVerification:
                     na=False,
                 )
             ]
-            # machine_label = self.aligned_paragraphs_df[
-            #     self.aligned_paragraphs_df["label"] == "MACHINE"
-            # ]
+
             if len(machine_label) > 0:
                 label = " ".join(machine_label["label"].tolist())
                 self.text_prediction_label[0] = label
                 self.text_prediction_score[0] = machine_label["score"].mean()
             else:
-                machine_label = self.aligned_paragraphs_df[
-                    self.aligned_paragraphs_df["label"] == "HUMAN"
+                machine_label = self.aligned_sentences_df[
+                    self.aligned_sentences_df["label"] == "HUMAN"
                 ]
                 self.text_prediction_label[0] = "HUMAN"
                 self.text_prediction_score[0] = machine_label["score"].mean()
         else:  # no source found in the input text
             print("No source found in the input text")
-            text = " ".join(self.aligned_paragraphs_df["input"].tolist())
+            text = " ".join(self.aligned_sentences_df["input"].tolist())
             # detect by baseline model
             label, score = detect_text_by_ai_model(text)
             self.text_prediction_label[0] = label
@@ -149,14 +147,15 @@ class NewsVerification:
         print("CHECK TEXT:")
         print("\tFrom search engine:")
         # Classify by search engine
-        input_sentences = split_into_paragraphs(self.news_text)
+        #input_sentences = split_into_sentences(self.news_text)
+        input_paragraphs = split_into_paragraphs(self.news_text)
 
         # Setup df for input_sentences
 
-        for _ in range(len(input_sentences)):
-            self.aligned_paragraphs_df = pd.concat(
+        for _ in range(len(input_paragraphs)):
+            self.aligned_sentences_df = pd.concat(
                 [
-                    self.aligned_paragraphs_df,
+                    self.aligned_sentences_df,
                     pd.DataFrame(
                         [
                             {
@@ -174,20 +173,20 @@ class NewsVerification:
                     ignore_index=True,
                 )
 
-        # find a source for each paragraph
-        for index, _ in enumerate(input_sentences):
-            similarity = self.aligned_paragraphs_df.loc[index, "similarity"]
+        # find a source for each sentence
+        for index, _ in enumerate(input_paragraphs):
+            similarity = self.aligned_sentences_df.loc[index, "similarity"]
             if similarity is not None:
                 if similarity > PARAPHRASE_THRESHOLD_MACHINE:
                     continue
 
            print(f"\n-------index = {index}-------")
-            print(f"current_text = {input_sentences[index]}\n")
+            print(f"current_text = {input_paragraphs[index]}\n")
 
-            self.aligned_paragraphs_df, img_urls = find_paragraph_source(
-                input_sentences,
+            self.aligned_sentences_df, img_urls = find_sentence_source(
+                input_paragraphs,
                 index,
-                self.aligned_paragraphs_df,
+                self.aligned_sentences_df,
             )
 
             self.found_img_url.extend(img_urls)
@@ -199,13 +198,13 @@ class NewsVerification:
         score = 0
         # calculate the average similarity when the similary score
        # in each row of sentences_df is higher than 0.8
-        filtered_by_url = self.aligned_paragraphs_df[
-            self.aligned_paragraphs_df["url"] == url
+        filtered_by_url = self.aligned_sentences_df[
+            self.aligned_sentences_df["url"] == url
        ]
        filtered_by_similarity = filtered_by_url[
            filtered_by_url["similarity"] > 0.8
        ]
-        if len(filtered_by_similarity) / len(self.aligned_paragraphs_df) > 0.5:
+        if len(filtered_by_similarity) / len(self.aligned_sentences_df) > 0.5:
            # check if "MACHINE" is in self.aligned_sentences_df["label"]:
            contains_machine = (
                filtered_by_similarity["label"]
@@ -305,9 +304,9 @@ class NewsVerification:
             row["source"],
         )
 
-        for index, paragraph in self.aligned_paragraphs_df.iterrows():
-            if paragraph["url"] == row["url"]:
-                self.aligned_paragraphs_df.at[index, "entities"] = (
+        for index, sentence in self.aligned_sentences_df.iterrows():
+            if sentence["url"] == row["url"]:
+                self.aligned_sentences_df.at[index, "entities"] = (
                     entities_with_colors  # must use at
                 )
 
@@ -353,10 +352,9 @@ class NewsVerification:
 
     def create_fact_checker_table(self):
         rows = []
-        max_length = len(self.aligned_paragraphs_df)
-        rows.append(self.format_image_fact_checker_row(max_length))
+        rows.append(self.format_image_fact_checker_row())
 
-        for _, row in self.aligned_paragraphs_df.iterrows():
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue
 
@@ -404,6 +402,8 @@ class NewsVerification:
             if span_row == 1:
                 last_url_row = True
 
+            # end_of_paragraph = is_newline_after_text(row[0]["input"], self.news_content)
+
             formatted_row = self.format_text_fact_checker_row(
                 row,
                 first_url_row,
@@ -555,7 +555,7 @@ class NewsVerification:
         </tr>
         """
 
-    def format_image_fact_checker_row(self, max_length):
+    def format_image_fact_checker_row(self):
 
         if (
             self.image_referent_url is not None
@@ -577,9 +577,8 @@ class NewsVerification:
 
     def create_ordinary_user_table(self):
         rows = []
-        max_length = len(self.aligned_paragraphs_df)
-        rows.append(self.format_image_ordinary_user_row(max_length))
-        rows.append(self.format_text_ordinary_user_row(max_length))
+        rows.append(self.format_image_ordinary_user_row())
+        rows.append(self.format_text_ordinary_user_row())
         table = "\n".join(rows)
 
         return f"""
@@ -607,7 +606,7 @@ class NewsVerification:
         input_sentences = ""
         source_text_urls = ""
         urls = []
-        for _, row in self.aligned_paragraphs_df.iterrows():
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue
             input_sentences += row["input"] + "<br><br>"
@@ -641,16 +640,14 @@ class NewsVerification:
 
     def create_governor_table(self):
         rows = []
-        max_length = len(self.aligned_paragraphs_df)
-        rows.append(self.format_image_governor_row(max_length))
+        rows.append(self.format_image_governor_row())
 
-        for _, row in self.aligned_paragraphs_df.iterrows():
+        for _, row in self.aligned_sentences_df.iterrows():
             if row["input"] is None:
                 continue
 
             if row["source"] is None:
                 equal_idx_1 = equal_idx_2 = []
-
             else:
                 # Get index of equal phrases in input and source sentences
                 equal_idx_1, equal_idx_2 = extract_equal_text(
@@ -667,7 +664,7 @@ class NewsVerification:
                 ],
             )
 
-            formatted_row = self.format_text_governor_row(max_length)
+            formatted_row = self.format_text_governor_row()
             rows.append(formatted_row)
 
         table = "\n".join(rows)
@@ -694,7 +691,7 @@ class NewsVerification:
     <style>
     """
 
-    def format_text_governor_row(self, max_length):
+    def format_text_governor_row(self):
         input_sentences = ""
         source_sentences = ""
         source_text_urls = ""
@@ -705,9 +702,7 @@ class NewsVerification:
             if row[0]["input"] is None:
                 continue
 
-            if (
-                row[0]["source"] is not None and row[3] is not None
-            ):  # source is not empty
+            if row[0]["source"] is not None:  # source is not empty
                 # highlight entities
                 input_sentence, highlight_idx_input = apply_highlight(
                     row[0]["input"],
@@ -779,7 +774,7 @@ class NewsVerification:
         </tr>
         """
 
-    def format_image_governor_row(self, max_length):
+    def format_image_governor_row(self):
         if (
             self.image_referent_url is not None
             or self.image_referent_url != ""
@@ -803,7 +798,7 @@ class NewsVerification:
         return entity_count_text
 
     def color_text(self, text, colored_idx, highlighted_idx):
-        paragraph = ""
+        sentence = ""
         words = text.split()
 
         starts, ends = self.extract_starts_ends(colored_idx)
@@ -811,16 +806,16 @@ class NewsVerification:
 
         previous_end = 0
         for start, end in zip(starts, ends):
-            paragraph += " ".join(words[previous_end:start])
+            sentence += " ".join(words[previous_end:start])
 
             equal_words = " ".join(words[start:end])
-            paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
+            sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "
 
             previous_end = end
 
-        paragraph += " ".join(words[previous_end:])
+        sentence += " ".join(words[previous_end:])
 
-        return paragraph
+        return sentence
 
     def extract_starts_ends(self, colored_idx):
         starts = []
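Note: the verify_text path above leans on the groupby("url").agg(concat_text) idiom to check all sentences that matched one source URL in a single pass. A self-contained sketch of that step, with made-up data standing in for aligned_sentences_df:

    import pandas as pd

    def concat_text(series):
        # Same idea as the aggregator above: join every row of a column
        # into one string so a whole source URL can be verified at once.
        return " ".join(series.astype(str).tolist())

    df = pd.DataFrame(
        {
            "input": ["sentence one", "sentence two", "sentence three"],
            "source": ["src a", "src b", "src c"],
            "url": ["u1", "u1", "u2"],
        }
    )
    grouped = df.groupby("url").agg({"input": concat_text, "source": concat_text})
    print(grouped.loc["u1", "input"])  # "sentence one sentence two"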
src/application/text/entity.py  CHANGED

@@ -362,14 +362,14 @@ set to take office on Monday, could potentially reduce aid.
 """
 if __name__ == "__main__":
     with gr.Blocks() as demo:
-        gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
+        gr.Markdown("### Highlight Matching Parts Between Two Texts")
         text1_input = gr.Textbox(
-            label="Paragraph 1",
+            label="Text 1",
             lines=5,
             value=original_text,
         )
         text2_input = gr.Textbox(
-            label="Paragraph 2",
+            label="Text 2",
             lines=5,
             value=compared_text,
         )
src/application/text/helper.py  CHANGED

@@ -61,7 +61,7 @@ def get_keywords(text, num_keywords=5):
 
 
 def get_important_sentences(
-    paragraph: str,
+    sentence: str,
     keywords: list[str],
     num_sentences: int = 3,
 ) -> list[str]:
@@ -69,16 +69,16 @@ def get_important_sentences(
     Selects important sentences based on a list of keywords.
 
     Args:
-        paragraph (str): The input paragraph.
+        sentence (str): The input sentence.
         keywords (list[str]): List of important keywords.
         num_sentences (int): Number of sentences to return (default is 3).
 
     Returns:
         list: A list of important sentences.
     """
-    # Clean and split the paragraph into sentences
+    # Clean and split the sentence into sentences
     sentences = [
-        s.strip() for s in re.split(r"(?<=[.!?])\s+", paragraph) if s.strip()
+        s.strip() for s in re.split(r"(?<=[.!?])\s+", sentence) if s.strip()
     ]
 
     # Calculate the importance score for each sentence
@@ -103,7 +103,7 @@ def get_important_sentences(
 
 
 def extract_important_phrases(
-    paragraph: str,
+    text: str,
     keywords: list[str],
     phrase_length: int = 5,
 ) -> list[str]:
@@ -112,20 +112,20 @@ def extract_important_phrases(
     Phrase length is auto-determined, and overlapped parts are less than 20%.
 
     Args:
-        paragraph (str): The input paragraph.
+        text (str): The input text.
         keywords (list[str]): List of important keywords.
         phrase_length (int): Length of phrases to extract (default: 5 words).
 
     Returns:
         list: A list of important phrases.
     """
-    # Tokenize the paragraph into words
-    words = word_tokenize(paragraph.lower())
+    # Tokenize the text into words
+    words = word_tokenize(text.lower())
 
     # Determine phrase length (between 3 and 7 words)
     phrase_length = min(max(len(words) // 10, 5), 7)
 
-    # Generate n-grams (phrases) from the paragraph
+    # Generate n-grams (phrases) from the text
     phrases = list(ngrams(words, phrase_length))
 
     important_phrases = []
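Note: the renamed parameters change nothing about the splitting itself. A quick check of the regex used in get_important_sentences, which splits on whitespace that follows ., ! or ? while keeping the punctuation:

    import re

    text = "AI is moving fast. Is that good? Time will tell!"
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    print(sentences)
    # ['AI is moving fast.', 'Is that good?', 'Time will tell!']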
src/application/text/highlight_text.py  CHANGED

@@ -57,7 +57,7 @@ def generate_color(index, total_colors=20):
 
 
 def highlight_pairs(text1, text2):
-    """Highlight matching pairs between two paragraphs"""
+    """Highlight matching pairs between two texts"""
     # Predefined matching pairs
     match_pairs = [
         {
@@ -145,7 +145,7 @@ def highlight_pairs(text1, text2):
     highlighted_text += text[prev_end:]
     return highlighted_text
 
-    # Apply highlighting to both paragraphs
+    # Apply highlighting to both texts using the global MATCH_PAIRS
     highlighted_text1 = apply_highlight(
         text1,
         match_pairs,
@@ -171,9 +171,9 @@ if __name__ == "__main__":
     text1 = ""
 
     with gr.Blocks() as demo:
-        gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
+        gr.Markdown("### Highlight Matching Parts Between Two texts")
         text1_input = gr.Textbox(
-            label="Paragraph 1",
+            label="Text 1",
            lines=5,
            value="""
 The field of deep learning is advancing rapidly.
@@ -181,7 +181,7 @@ Modern neural networks are improving AI research significantly.
 """,
         )
         text2_input = gr.Textbox(
-            label="Paragraph 2",
+            label="Text 2",
             lines=5,
             value="""
 Advances in deep learning have led to breakthroughs in AI research.
src/application/text/preprocessing.py  CHANGED

@@ -1,7 +1,7 @@
 from nltk.tokenize import sent_tokenize
 
 
-def split_into_paragraphs(input_text):
+def split_into_sentences(input_text):
     """
     Splits input text into sentences by newlines.
 
@@ -21,3 +21,26 @@ def split_into_paragraphs(input_text):
         if paragraph and paragraph != "\n":
             sentences.extend(sent_tokenize(paragraph))
     return sentences
+
+
+def split_into_paragraphs(input_text):
+    """
+    Splits input text into sentences by newlines.
+
+    Args:
+        input_text: The input text as a string.
+
+    Returns:
+        A list of sentences. Returns an empty list if input is not valid.
+    """
+    if not isinstance(input_text, str):
+        return []
+
+    paragraphs = input_text.splitlines(keepends=True)
+    out_paragraphs = []
+    for paragraph in paragraphs:
+        paragraph = paragraph.strip()
+        if paragraph and paragraph != "\n":
+            out_paragraphs.append(paragraph)
+    print(f"paragraphs: {out_paragraphs}")
+    return out_paragraphs
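Note: after this change the two functions differ only in whether each newline-delimited chunk is further tokenized. A sketch of the expected behavior, mirroring the two bodies above (sent_tokenize needs the NLTK punkt data):

    from nltk.tokenize import sent_tokenize

    text = "First sentence. Second sentence.\nSecond paragraph here."

    # Mirrors split_into_sentences: sent_tokenize each non-empty line, flatten.
    sentences = []
    for line in text.splitlines():
        if line.strip():
            sentences.extend(sent_tokenize(line.strip()))
    print(sentences)   # ['First sentence.', 'Second sentence.', 'Second paragraph here.']

    # Mirrors split_into_paragraphs: one entry per non-empty line.
    paragraphs = [line.strip() for line in text.splitlines() if line.strip()]
    print(paragraphs)  # ['First sentence. Second sentence.', 'Second paragraph here.']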
src/application/text/search_detection.py  CHANGED

@@ -9,7 +9,7 @@ from sentence_transformers import (
     util,
 )
 
-from src.application.text.preprocessing import split_into_paragraphs
+from src.application.text.preprocessing import split_into_sentences
 from src.application.text.search import (
     generate_search_phrases,
     search_by_google,
@@ -38,7 +38,7 @@ MIN_RATIO_PARAPHRASE_NUM = 0.5
 MAX_CHAR_SIZE = 30000
 
 
-def find_paragraph_source(text, text_index, sentences_df):
+def find_sentence_source(text, text_index, sentences_df):
 
     checked_urls = set()
     searched_phrases = generate_search_phrases(text[text_index])
@@ -63,14 +63,14 @@ def find_paragraph_source(text, text_index, sentences_df):
             print("\t\t\t↑↑↑ Title or text not found")
             continue
 
-        page_text = content.title + "\n" + content.text
-        if len(page_text) > MAX_CHAR_SIZE:
+        source_text = content.title + "\n" + content.text
+        if len(source_text) > MAX_CHAR_SIZE:
             print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
             continue
         print(f"\t\t\t↑↑↑ Title: {content.title}")
         aligned_sentence = check_paraphrase(
             text[text_index],
-            page_text,
+            source_text,
             url,
         )
 
@@ -105,7 +105,7 @@ def find_paragraph_source(text, text_index, sentences_df):
         # find matched content in new url
         aligned_sentence = check_paraphrase(
             text[idx],
-            page_text,
+            source_text,
             url,
         )
 
@@ -222,7 +222,7 @@ def check_sentence(
     return False
 
 
-def check_paraphrase(input_text, page_text, url):
+def check_paraphrase(input_text, source_text, url):
     """
     Checks if the input text is paraphrased in the content at the given URL.
 
@@ -237,30 +237,30 @@ def check_paraphrase(input_text, page_text, url):
     """
 
     # Extract sentences from input text and web page
-    input_paragraphs = split_into_paragraphs(input_text)
+    input_sentences = split_into_sentences(input_text)
 
-    if not page_text:
+    if not source_text:
         return {}
 
-    page_paragraphs = split_into_paragraphs(page_text)
-    if not input_paragraphs or not page_paragraphs:
+    source_sentences = split_into_sentences(source_text)
+    if not input_sentences or not source_sentences:
         return {}
 
     additional_sentences = []
-    for sentence in page_paragraphs:
+    for sentence in source_sentences:
         if ", external" in sentence:
             additional_sentences.append(sentence.replace(", external", ""))
-    page_paragraphs.extend(additional_sentences)
+    source_sentences.extend(additional_sentences)
 
     # Encode sentences into embeddings
     embeddings1 = PARAPHASE_MODEL.encode(
-        input_paragraphs,
+        input_sentences,
         convert_to_tensor=True,
         device=DEVICE,
         show_progress_bar=False,
     )
     embeddings2 = PARAPHASE_MODEL.encode(
-        page_paragraphs,
+        source_sentences,
         convert_to_tensor=True,
         device=DEVICE,
         show_progress_bar=False,
@@ -270,23 +270,31 @@ def check_paraphrase(input_text, page_text, url):
     similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
 
     # Find sentence alignments
-    alignment = {}
-    for i, paragraph in enumerate(input_paragraphs):
+    inputs = ""
+    sources = ""
+    similarities = []
+
+    for i, sentence in enumerate(input_sentences):
         max_sim_index = np.argmax(similarity_matrix[i])
         max_similarity = similarity_matrix[i][max_sim_index]
-        best_matched_sentence = page_paragraphs[max_sim_index]
-
-    label, is_paraphrased = determine_label(max_similarity)
-    alignment = {
-        "input": input_text,
-        "source": best_matched_sentence,
-        "similarity": max_similarity,
+        best_matched_sentence = source_sentences[max_sim_index]
+
+        inputs += sentence + " "
+        sources += best_matched_sentence + " "
+        similarities.append(max_similarity)
+
+
+    similarity = sum(similarities) / len(similarities)
+    label, is_paraphrased = determine_label(max_similarity)
+    alignment = {
+        "input": inputs,
+        "source": sources,
+        "similarity": similarity,
         "label": label,
         "paraphrase": is_paraphrased,
         "url": url,
     }
+    print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')
 
     return alignment
 
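Note: the rewritten alignment step pairs each input sentence with its highest-scoring source sentence and averages the scores. A toy version with a hand-made similarity matrix (rows = input sentences, columns = source sentences):

    import numpy as np

    similarity_matrix = np.array(
        [
            [0.91, 0.20, 0.10],
            [0.15, 0.85, 0.30],
        ]
    )
    similarities = []
    for i in range(similarity_matrix.shape[0]):
        max_sim_index = np.argmax(similarity_matrix[i])  # best source match per row
        similarities.append(similarity_matrix[i][max_sim_index])

    similarity = sum(similarities) / len(similarities)
    print(similarity)  # 0.88

One observable quirk of the committed version: the stored "similarity" is this average, while determine_label(max_similarity) is fed the max score from the last loop iteration only.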
test.py  CHANGED

@@ -1,14 +1,74 @@
-import
-[remaining removed lines are blanked out in the diff view and not recoverable]
+import re
+
+def is_newline_after_text(text1, text2):
+    """
+    Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
+
+    Args:
+        text1: The text to search for.
+        text2: The text to search within.
+
+    Returns:
+        A tuple: (True/False if text1 is found, True/False if next char is newline, or None if not found)
+    """
+
+    match = re.search(re.escape(text1), text2) #escape text1 to handle special characters
+
+    if match:
+        # Find the next non-space character
+        next_char_index = match.end()
+        while next_char_index < len(text2) and text2[next_char_index].isspace():
+            next_char_index += 1
+
+        if text2[next_char_index:next_char_index+2] == r'\n':
+            print("newline found")
+        if next_char_index < len(text2) and text2[next_char_index:next_char_index+2] == r'\n':
+            return True
+
+    return False
+
+def is_newline_after_text_2(text1, text2):
+    """
+    Checks if text1 is in text2 and if the next non-space character after text1 is a newline.
+
+    Args:
+        text1: The text to search for.
+        text2: The text to search within.
+
+    Returns:
+        True if next char is newline
+    """
+    text2 = text2.replace("\n", "\\n")
 
+    ater_text = text2.split(text1)
+    if len(ater_text) > 1:
+        ater_text = ater_text[1].lstrip() # Remove spaces
+        if ater_text.startswith('\n'):
+            return True
+    return False
+
+# Example usage:
+text1 = "hello"
+text2 = "some text hello \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello more text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
+
+text1 = "hello"
+text2 = "some text hello\t\nmore text" #test tab space before newline
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
 
+text1 = "hello." #test special characters
+text2 = "some text hello. \nmore text"
+result = is_newline_after_text_2(text1, text2)
+print(f"Next char is newline: {result}\n")
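Note: as committed, both helpers test for the two-character sequence backslash-n (r'\n' in the first, the replace("\n", "\\n") trick in the second) rather than an actual newline character, so every example above prints False. A minimal corrected sketch of the same check:

    import re

    def newline_follows(text1: str, text2: str) -> bool:
        # True if text1 occurs in text2 and the first non-space character
        # after the match is a real newline.
        match = re.search(re.escape(text1), text2)
        if not match:
            return False
        i = match.end()
        while i < len(text2) and text2[i] in (" ", "\t"):  # skip spaces/tabs only
            i += 1
        return i < len(text2) and text2[i] == "\n"

    print(newline_follows("hello", "some text hello \nmore text"))  # True
    print(newline_follows("hello", "some text hello more text"))    # False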