""" | |
Author: Khanh Phan | |
Date: 2024-12-04 | |
""" | |
from openai import OpenAIError | |
from sentence_transformers import util | |
from transformers import pipeline | |
from src.application.config import ( | |
AI_TEXT_DECTECTION_MODEL, | |
AZUREOPENAI_CLIENT, | |
DEVICE, | |
GPT_PARAPHRASE_MODELS, | |
HUMAN, | |
MACHINE, | |
MODEL_HUMAN_LABEL, | |
PARAPHRASE_MODEL, | |
UNKNOWN, | |
) | |


def detect_text_by_ai_model(
    input_text: str,
    model: str = AI_TEXT_DECTECTION_MODEL,
    max_length: int = 512,
) -> tuple[str, float]:
    """
    Detects whether text is human- or machine-generated.

    Model: RADAR-Vicuna-7B
    Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B

    Args:
        input_text (str): The text to be classified.
        model (str, optional): The name of the AI text detection model.
        max_length (int, optional): The maximum token length to which
            the input text is truncated.

    Returns:
        tuple: (label, confidence_score),
            where label is HUMAN or MACHINE.
    """
    try:
        # Create a text classification pipeline using the specified model.
        pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,  # TODO: consider removal.
            truncation=True,
            device_map="auto",  # Automatically place the model on available devices.
        )
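        # NOTE: the pipeline is rebuilt on every call; callers that classify
        # many texts may want to construct it once and reuse it.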

        # Replace HTML line breaks with spaces to improve processing.
        input_text = input_text.replace("<br>", " ")

        # Perform text classification using the pipeline.
        result = pipe(input_text)[0]
        confidence_score = result["score"]

        # Determine the label based on the model's prediction.
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
            generated_model, _ = predict_generation_model(input_text)
            label += f"<br>({generated_model})"
        return label, confidence_score
    except Exception as e:
        print(f"Error in AI detection model inference: {e}")
        return UNKNOWN, 0.5  # Fall back to UNKNOWN with a neutral 0.5 confidence.
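

# A minimal usage sketch for detect_text_by_ai_model (the input string is
# hypothetical; assumes the config exports above are initialized):
#     label, score = detect_text_by_ai_model("Some paragraph of news text.")
#     print(label, score)  # e.g. HUMAN, 0.93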


def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Predicts whether text was generated by a gpt-4o or gpt-4o-mini model
    by comparing the input text against paraphrases produced by each model.

    Args:
        text (str): The input text to be analyzed.

    Returns:
        tuple: (label, confidence_score),
            where label is gpt-4o or gpt-4o-mini,
            and confidence_score is the highest similarity.
    """
    best_similarity = 0.0
    best_model = GPT_PARAPHRASE_MODELS[0]
    for model in GPT_PARAPHRASE_MODELS:
        # Generate paraphrased text using the current model.
        paraphrased_text = paraphrase_by_AI(text, model)

        # Skip to the next model if paraphrasing fails (returns None).
        if paraphrased_text is None:
            continue

        # Similarity between the original text and the paraphrased text.
        similarity = measure_text_similarity(text, paraphrased_text)

        # Keep the model whose paraphrase is closest to the original.
        if similarity > best_similarity:
            best_similarity = similarity
            best_model = model
    return best_model, best_similarity
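

# The implicit assumption behind predict_generation_model: a model asked to
# paraphrase text it generated itself tends to change it less, so the model
# whose paraphrase is most similar to the input is the likeliest source.
# A hypothetical call:
#     model_name, similarity = predict_generation_model(some_text)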


def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str | None:
    """
    Paraphrases text using a given AI model.

    Args:
        input_text (str): The text to be paraphrased.
        model (str, optional): The AI model to use for paraphrasing.

    Returns:
        str: The paraphrased text, or None if an error occurs.
    """
    prompt = f"""
    Paraphrase the following news, only output the paraphrased text:
    {input_text}
    """
    try:
        response = AZUREOPENAI_CLIENT.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
            # max_tokens=100,  # Limit the number of tokens in the response.
            # temperature=0.7,  # Control the randomness of the response.
            # top_p=0.9,  # Control nucleus sampling.
            # n=1,  # Number of responses to generate.
        )
        paraphrased_text = response.choices[0].message.content
        return paraphrased_text
    except OpenAIError as e:
        print(f"Error in AI model inference: {e}")
        return None
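

# A hypothetical call to paraphrase_by_AI (requires valid Azure OpenAI
# credentials behind AZUREOPENAI_CLIENT):
#     rewrite = paraphrase_by_AI("The council approved the budget.", "gpt-4o")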


def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measures the similarity between two texts
    using cosine similarity of their sentence embeddings.

    Args:
        text1 (str): The first text string.
        text2 (str): The second text string.

    Returns:
        float: The cosine similarity score between the two texts.
    """
    # Generate sentence embeddings for each text.
    embeddings1 = PARAPHRASE_MODEL.encode(
        text1,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        text2,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # Compute the 1x1 cosine similarity matrix and return its single
    # entry as a plain Python float.
    similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    return float(similarity[0][0])
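

# A minimal end-to-end sketch, assuming src.application.config is importable
# and the Azure OpenAI client is configured; the sample text is hypothetical.
if __name__ == "__main__":
    sample = (
        "The city council voted on Tuesday to expand the downtown "
        "bike-lane network over the next two years."
    )
    label, score = detect_text_by_ai_model(sample)
    print(f"Detection: {label} (confidence {score:.2f})")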