""" | |
Author: Khanh Phan | |
Date: 2024-12-04 | |
""" | |

import warnings

import pandas as pd

from src.application.config import (
    MIN_RATIO_PARAPHRASE_NUM,
    PARAPHRASE_THRESHOLD,
    PARAPHRASE_THRESHOLD_MACHINE,
)
from src.application.formatting_fact_checker import create_fact_checker_table
from src.application.formatting_governor import create_governor_table
from src.application.formatting_ordinary_user import create_ordinary_user_table
from src.application.image.image import ImageDetector
from src.application.image.image_detection import (
    detect_image_by_ai_model,
    detect_image_by_reverse_search,
    detect_image_from_news_image,
)
from src.application.text.entity import highlight_entities
from src.application.text.helper import (
    postprocess_label,
    split_into_paragraphs,
)
from src.application.text.model_detection import (
    detect_text_by_ai_model,
    predict_generation_model,
)
from src.application.text.search_detection import find_sentence_source
from src.application.text.text import TextDetector


class NewsVerification:
    def __init__(self):
        """
        Initializes the NewsVerification object.
        """
        self.news_text: str = ""
        self.news_title: str = ""
        self.news_content: str = ""
        self.news_image: str = ""

        self.text = TextDetector()
        self.image = ImageDetector()

        self.news_prediction_label: str = ""
        self.news_prediction_score: float = -1.0

        # News URLs in which to look for matching images.
        self.found_img_url: list[str] = []

        # Analyzed results. The "entities" column is included so the schema
        # matches the DataFrame created in find_text_source() and the
        # assignments made in handle_entities().
        self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
            columns=[
                "input",
                "source",
                "label",
                "similarity",
                "paraphrase",
                "url",
                "entities",
            ],
        )
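
    # Illustrative (hypothetical) shape of one aligned_sentences_df row once
    # find_text_source() has filled it in; the concrete values are made up:
    #   input:      "One paragraph of the submitted article."
    #   source:     "Closest matching paragraph found on the web."
    #   label:      "HUMAN" or "MACHINE"
    #   similarity: 0.87
    #   paraphrase: True
    #   url:        "https://example.com/article"
    #   entities:   assigned later by handle_entities()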

    def load_news(self, news_title: str, news_content: str, news_image: str):
        """
        Loads news data into the object's attributes.

        Args:
            news_title (str): The title of the news article.
            news_content (str): The content of the news article.
            news_image (str): The URL of the image in the news article.
        """
        if not isinstance(news_title, str) or not isinstance(
            news_content,
            str,
        ):
            raise TypeError("News title and content must be strings.")
        if news_image is not None and not isinstance(news_image, str):
            warnings.warn("News image must be a string.")

        # Combine title and content for a full text representation.
        self.news_text = news_title + "\n\n" + news_content

        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

        self.text.input = self.news_text
        self.image.input = news_image

    def group_by_url(self):
        """
        Groups aligned sentences by URL, then concatenates the 'input' and
        'source' text for each group.
        """

        def concat_text(series):
            """
            Concatenates the elements of a pd.Series into a single string.
            """
            # astype(str) handles mixed data types and NaNs.
            return " ".join(series.astype(str).tolist())

        # Group sentences by URL and concatenate 'input' and 'source' text.
        self.text.grouped_url_df = (
            self.aligned_sentences_df.groupby("url")
            .agg(
                {
                    "input": concat_text,
                    "source": concat_text,
                },
            )
            .reset_index()  # Make 'url' a regular column again.
        )

        # Add new columns for label and score.
        self.text.grouped_url_df["label"] = None
        self.text.grouped_url_df["score"] = None

        print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")

    def determine_text_origin_by_url(self):
        """
        Determines the text origin for each URL group.
        """
        for index, row in self.text.grouped_url_df.iterrows():
            # Verify text origin using URL-based verification.
            label, score = self.verify_text(row["url"])

            # If URL-based verification returns 'UNKNOWN', use AI detection.
            if label == "UNKNOWN":
                # 'input' already holds the concatenated text for this group
                # (see group_by_url), so pass it through as-is.
                text = row["input"]

                # Detect text origin using an AI model.
                label, score = detect_text_by_ai_model(text)

            print(f"label = {label}")
            self.text.grouped_url_df.at[index, "label"] = label
            self.text.grouped_url_df.at[index, "score"] = score

    def determine_text_origin(self):
        """
        Determines the origin of the input text by analyzing its sources
        and applying AI detection models.

        This method groups sentences by their source URLs, applies
        verification and AI detection, and then determines an overall
        label and score for the input text.
        """
        # Find the source URLs associated with the input text.
        self.find_text_source()

        # Group sentences by URL and concatenate 'input' and 'source' text.
        self.group_by_url()

        # Determine the text origin for each URL group.
        self.determine_text_origin_by_url()

        # Determine the overall label and score for the entire input text.
        if not self.text.grouped_url_df.empty:
            # Check for 'gpt-4o' labels in the grouped URLs.
            machine_label = self.text.grouped_url_df[
                self.text.grouped_url_df["label"].str.contains(
                    "gpt-4o",
                    case=False,
                    na=False,
                )
            ]
            print(f" machine_label = {machine_label}")

            if not machine_label.empty:
                # If 'gpt-4o' labels are found, post-process and assign.
                labels = machine_label["label"].tolist()
                label = postprocess_label(labels)
                self.text.prediction_label[0] = label
                self.text.prediction_score[0] = machine_label["score"].mean()
            else:
                # If no 'gpt-4o' labels, label the text as human-written and
                # average the scores across all URL groups.
                self.text.prediction_label[0] = "HUMAN"
                self.text.prediction_score[0] = self.text.grouped_url_df[
                    "score"
                ].mean()
        else:
            # If no URLs were found, use AI detection on the entire input.
            print("No source found in the input text")
            text = " ".join(
                self.aligned_sentences_df["input"].dropna().astype(str),
            )

            # Detect text origin using an AI model.
            label, score = detect_text_by_ai_model(text)
            self.text.prediction_label[0] = label
            self.text.prediction_score[0] = score

    def find_text_source(self):
        """
        Determines the origin of the given text based on paraphrasing
        detection and human authorship analysis.

        1. Splits the input news text into paragraphs.
        2. Searches for a source for each paragraph.
        3. Updates aligned_sentences_df with the found sources.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        input_paragraphs = split_into_paragraphs(self.news_text)

        # Initialize an empty DataFrame if it doesn't exist yet.
        if (
            not hasattr(self, "aligned_sentences_df")
            or self.aligned_sentences_df is None
        ):
            self.aligned_sentences_df = pd.DataFrame(
                columns=[
                    "input",
                    "source",
                    "label",
                    "similarity",
                    "paraphrase",
                    "url",
                    "entities",
                ],
            )

        # Add one placeholder row per input paragraph.
        placeholder_rows = pd.DataFrame(
            [
                {
                    "input": None,
                    "source": None,
                    "label": None,
                    "similarity": None,
                    "paraphrase": None,
                    "url": None,
                    "entities": None,
                },
            ]
            * len(input_paragraphs),
        )
        self.aligned_sentences_df = pd.concat(
            [self.aligned_sentences_df, placeholder_rows],
            ignore_index=True,
        )

        # Find a source for each paragraph.
        for index, _ in enumerate(input_paragraphs):
            similarity = self.aligned_sentences_df.loc[index, "similarity"]
            # Skip paragraphs that already have a high-similarity match.
            if (
                similarity is not None
                and similarity > PARAPHRASE_THRESHOLD_MACHINE
            ):
                continue

            print(f"\n-------index = {index}-------")
            print(f"current_text = {input_paragraphs[index]}\n")

            self.aligned_sentences_df, img_urls = find_sentence_source(
                input_paragraphs,
                index,
                self.aligned_sentences_df,
            )

            # Initialize found_img_url if it does not exist.
            if not hasattr(self, "found_img_url"):
                self.found_img_url = []
            self.found_img_url.extend(img_urls)
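
    # Note: find_sentence_source() returns the updated DataFrame together
    # with any image URLs collected from the matched pages; those URLs are
    # accumulated in found_img_url and reused by determine_image_origin().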

    def verify_text(self, url):
        """
        Verifies the text origin based on similarity scores and labels
        associated with a given URL.

        1. Filters sentences by URL and similarity score.
        2. Determines if the text is likely generated by a machine or a human.
        3. Calculates an average similarity score.

        Args:
            url (str): The URL to filter sentences by.

        Returns:
            tuple: A pair of
                - label ("MACHINE", "HUMAN", or "UNKNOWN")
                - score
        """
        label = "UNKNOWN"
        score = 0

        # Filter sentences by URL.
        filtered_by_url = self.aligned_sentences_df[
            self.aligned_sentences_df["url"] == url
        ]

        # Filter sentences by similarity score (> PARAPHRASE_THRESHOLD).
        filtered_by_similarity = filtered_by_url[
            filtered_by_url["similarity"] > PARAPHRASE_THRESHOLD
        ]

        # Check whether the ratio of high-similarity sentences exceeds
        # MIN_RATIO_PARAPHRASE_NUM (guarding against an empty group).
        if (
            len(filtered_by_url) > 0
            and len(filtered_by_similarity) / len(filtered_by_url)
            > MIN_RATIO_PARAPHRASE_NUM
        ):
            # Check if any label contains "MACHINE".
            contains_machine = (
                filtered_by_similarity["label"]
                .str.contains(
                    "MACHINE",
                    case=False,
                    na=False,
                )
                .any()
            )
            print(f"contains_machine = \n{contains_machine}")

            # TODO: integrate with determine_text_origin
            if contains_machine:
                # If a "MACHINE" label is present, name the generation model
                # and average the similarity of the machine-labeled rows.
                machine_rows = filtered_by_similarity[
                    filtered_by_similarity["label"].str.contains(
                        "MACHINE",
                        case=False,
                        na=False,
                    )
                ]
                generated_model, _ = predict_generation_model(self.news_text)
                label = f"Partially generated by {generated_model}"
                score = machine_rows["similarity"].mean()
            else:
                # Otherwise assign the "HUMAN" label and average the
                # similarity of the human-labeled rows.
                label = "HUMAN"
                human_rows = filtered_by_similarity[
                    filtered_by_similarity["label"].str.contains(
                        "HUMAN",
                        case=False,
                        na=False,
                    )
                ]
                score = human_rows["similarity"].mean()

        return label, score
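
    # Worked example for verify_text with hypothetical numbers: if 3 of the
    # 4 sentences matched to a URL have similarity > PARAPHRASE_THRESHOLD,
    # the ratio 3/4 = 0.75 is compared against MIN_RATIO_PARAPHRASE_NUM.
    # Only when the ratio exceeds it is a MACHINE/HUMAN label assigned, with
    # the score taken as the mean similarity of the qualifying rows.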

    def determine_image_origin(self):
        """
        Determines the origin of the news image using 3 detection methods:

        1. Matching against previously found image URLs.
        2. Reverse image search.
        3. AI-based image detection.

        If none of these methods succeed, the image origin is "UNKNOWN".
        """
        print("CHECK IMAGE:")

        # Handle the case where no image is provided.
        if self.news_image is None:
            self.image.prediction_label = "UNKNOWN"
            self.image.prediction_score = 0.0
            self.image.referent_url = None
            return

        # Attempt to match the image against previously found image URLs.
        print("\tFrom found image URLs...")
        matched_url, similarity = detect_image_from_news_image(
            self.news_image,
            self.found_img_url,
        )
        if matched_url is not None:
            print(f"matched image: {matched_url}\nsimilarity: {similarity}\n")
            self.image.prediction_label = "HUMAN"
            self.image.prediction_score = similarity
            self.image.referent_url = matched_url
            return

        # Attempt to find the image origin using reverse image search.
        print("\tFrom reverse image search...")
        matched_url, similarity = detect_image_by_reverse_search(
            self.news_image,
        )
        if matched_url is not None:
            print(f"matched image: {matched_url}\tScore: {similarity}%\n")
            self.image.prediction_label = "HUMAN"
            self.image.prediction_score = similarity
            self.image.referent_url = matched_url
            return

        # Attempt to detect the image origin using an AI model.
        print("\tFrom an AI model...")
        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            print(f"detected_label: {detected_label} ({score})")
            self.image.prediction_label = detected_label
            self.image.prediction_score = score
            self.image.referent_url = None
            return

        # If all detection methods fail, mark the image origin as "UNKNOWN".
        self.image.prediction_label = "UNKNOWN"
        self.image.prediction_score = 50
        self.image.referent_url = None

    def determine_origin(self):
        """
        Determine origins by analyzing the news text and image.
        """
        if self.news_text != "":
            self.determine_text_origin()
        if self.news_image != "":
            self.determine_image_origin()

        # Handle entity recognition and processing.
        self.handle_entities()

    def generate_report(self) -> tuple[str, str, str]:
        """
        Generates reports tailored for different user roles
        (ordinary users, fact checkers, governors).

        Returns:
            tuple: A tuple containing three HTML-formatted reports:
                - ordinary_user_table: Report for ordinary users.
                - fact_checker_table: Report for fact checkers.
                - governor_table: Report for governors.
        """
        ordinary_user_table = create_ordinary_user_table(
            self.aligned_sentences_df,
            self.text,
            self.image,
        )
        fact_checker_table = create_fact_checker_table(
            self.aligned_sentences_df,
            self.text,
            self.image,
        )
        governor_table = create_governor_table(
            self.aligned_sentences_df,
            self.text,
            self.image,
        )
        return ordinary_user_table, fact_checker_table, governor_table

    def handle_entities(self):
        """
        Highlights and assigns entities with colors to aligned sentences
        based on grouped URLs.

        For each grouped URL:
        1. Highlights entities in the input and source text.
        2. Assigns these highlighted entities to the corresponding
           sentences in the aligned sentences DataFrame.
        """
        for _, row in self.text.grouped_url_df.iterrows():
            # Get entity words (in pairs) with colors.
            entities_with_colors = highlight_entities(
                row["input"],
                row["source"],
            )

            # Assign the highlighted entities to the corresponding sentences
            # in aligned_sentences_df.
            for sent_index, sentence in self.aligned_sentences_df.iterrows():
                if sentence["url"] == row["url"]:
                    # Use .at to modify the DataFrame efficiently.
                    self.aligned_sentences_df.at[sent_index, "entities"] = (
                        entities_with_colors
                    )

    def get_text_urls(self) -> set:
        """
        Returns a set of unique URLs referenced in the text analysis.

        Returns:
            set: A set containing the unique URLs referenced in the text.
        """
        # Collect the URLs recorded during the source search.
        return set(self.aligned_sentences_df["url"].dropna())
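

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the pipeline). It assumes the
# src.application dependencies (detection models, search APIs) are installed
# and configured; the title, content, and image URL below are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    verifier = NewsVerification()
    verifier.load_news(
        news_title="Example headline",  # hypothetical input
        news_content="Example body text of the article.",  # hypothetical input
        news_image="https://example.com/image.jpg",  # hypothetical URL
    )
    verifier.determine_origin()
    ordinary_user, fact_checker, governor = verifier.generate_report()
    print(ordinary_user)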