"""
Author: Khanh Phan
Date: 2024-12-04
"""
from openai import OpenAIError
from sentence_transformers import util
from transformers import pipeline

from src.application.config import (
    AI_TEXT_DECTECTION_MODEL,
    AZUREOPENAI_CLIENT,
    DEVICE,
    GPT_PARAPHRASE_MODELS,
    HUMAN,
    MACHINE,
    MODEL_HUMAN_LABEL,
    PARAPHRASE_MODEL,
    UNKNOWN,
)


def detect_text_by_ai_model(
    input_text: str,
    model: str = AI_TEXT_DECTECTION_MODEL,
    max_length: int = 512,
) -> tuple[str, float]:
"""
Model: RADAR-Vicuna-7B
Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B
Detects if text is human or machine generated.
Args:
input_text (str): The text to be classified.
model (str, optional): The name of the AI text detection model.
max_length (int, optional): The maximum length of the input text.
Returns:
tuple: (label, confidence_score)
where label is HUMAN or MACHINE.
"""
    try:
        # Create a text classification pipeline using the specified model.
        pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,  # TODO: consider removal
            truncation=True,
            device_map="auto",  # Place the model on available GPUs automatically.
        )

        # Replace HTML line breaks with spaces to improve processing.
        input_text = input_text.replace("<br>", " ")

        # Perform text classification using the pipeline.
        result = pipe(input_text)[0]
        confidence_score = result["score"]

        # Determine the label based on the model's prediction.
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
            # For machine-generated text, also guess which GPT model wrote it.
            generated_model, _ = predict_generation_model(input_text)
            label += f"<br>({generated_model})"

        return label, confidence_score
    except Exception as e:
        print(f"Error in AI detection model inference: {e}")
        return UNKNOWN, 0.5  # Fall back to UNKNOWN with neutral 0.5 confidence.
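
# A minimal usage sketch (assumes the config constants above are initialized
# and the RADAR-Vicuna-7B weights are available from Hugging Face; the sample
# string and printed values are illustrative):
#
#     label, score = detect_text_by_ai_model("A short paragraph to check.")
#     print(label, score)  # e.g. HUMAN, 0.97
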
def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Predicts whether the text was generated by gpt-4o or gpt-4o-mini.

    Each candidate model paraphrases the input text; the paraphrase most
    similar to the input indicates the likely generator.

    Args:
        text (str): The input text to be analyzed.

    Returns:
        tuple: (label, confidence_score),
            where label is gpt-4o or gpt-4o-mini,
            and confidence_score is the highest similarity.
    """
    best_similarity = 0
    best_model = GPT_PARAPHRASE_MODELS[0]

    for model in GPT_PARAPHRASE_MODELS:
        # Generate paraphrased text using the current model.
        paraphrased_text = paraphrase_by_AI(text, model)

        # Skip to the next model if paraphrasing fails (returns None).
        if paraphrased_text is None:
            continue

        # Similarity between the original text and the paraphrased text.
        similarity = measure_text_similarity(text, paraphrased_text)

        # Update the best similarity and model seen so far.
        if similarity > best_similarity:
            best_similarity = similarity
            best_model = model

    return best_model, best_similarity
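
# A hedged usage sketch (assumes GPT_PARAPHRASE_MODELS is a list such as
# ["gpt-4o", "gpt-4o-mini"] and the Azure OpenAI client is configured;
# the example output is illustrative):
#
#     model_name, similarity = predict_generation_model(article_text)
#     print(model_name, similarity)  # e.g. gpt-4o-mini, 0.91
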
def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str | None:
    """
    Paraphrases text using a given AI model.

    Args:
        input_text (str): The text to be paraphrased.
        model (str, optional): The AI model to use for paraphrasing.

    Returns:
        str: The paraphrased text, or None if an error occurs.
    """
    prompt = f"""
    Paraphrase the following news, only output the paraphrased text:
    {input_text}
    """
    try:
        response = AZUREOPENAI_CLIENT.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
            # max_tokens=100,   # Limit the number of tokens in the response.
            # temperature=0.7,  # Control the randomness of the response.
            # top_p=0.9,        # Control the nucleus sampling.
            # n=1,              # Number of completions to generate.
        )
        paraphrased_text = response.choices[0].message.content
        return paraphrased_text
    except OpenAIError as e:
        print(f"Error in AI model inference: {e}")
        return None
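
# A minimal sketch of a direct call (assumes AZUREOPENAI_CLIENT is an
# AzureOpenAI client with a "gpt-4o-mini" deployment; output varies per call):
#
#     rewritten = paraphrase_by_AI("Stocks fell sharply on Monday.")
#     if rewritten is not None:
#         print(rewritten)
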
def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measures the similarity between two texts
    using cosine similarity of their sentence embeddings.

    Args:
        text1 (str): The first text string.
        text2 (str): The second text string.

    Returns:
        float: The cosine similarity score between the two texts.
    """
    # Generate sentence embeddings for both texts.
    embeddings1 = PARAPHRASE_MODEL.encode(
        text1,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        text2,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # Compute the cosine similarity matrix; its single entry is the score.
    similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    return float(similarity[0][0])
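

if __name__ == "__main__":
    # A minimal smoke test, assuming the config constants (client, models,
    # device) are initialized; the sample text is illustrative only.
    sample = "The central bank raised interest rates by 25 basis points."

    label, score = detect_text_by_ai_model(sample)
    print(f"Detection: {label} (confidence {score:.2f})")

    model_name, similarity = predict_generation_model(sample)
    print(f"Closest paraphrase model: {model_name} (similarity {similarity:.2f})")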