"""
Author: Khanh Phan
Date: 2024-12-04

AI-generated-text detection: classifies text as human- or machine-written
(RADAR-Vicuna-7B), then attributes machine text to the most likely GPT
model by comparing the input against each model's paraphrase of it.
"""

from functools import lru_cache
from typing import Optional

from openai import OpenAIError
from sentence_transformers import util
from transformers import pipeline

from src.application.config import (
    AI_TEXT_DECTECTION_MODEL,
    AZUREOPENAI_CLIENT,
    DEVICE,
    GPT_PARAPHRASE_MODELS,
    HUMAN,
    MACHINE,
    MODEL_HUMAN_LABEL,
    PARAPHRASE_MODEL,
    UNKNOWN,
)


@lru_cache(maxsize=4)
def _get_detection_pipeline(model: str, max_length: int):
    """Build and cache the text-classification pipeline for a model.

    Loading model weights is expensive; the original code rebuilt the
    pipeline on every detection call. Caching keeps repeated calls cheap
    while preserving behavior.

    Args:
        model (str): Hugging Face model id used for both model and tokenizer.
        max_length (int): Maximum token length; longer inputs are truncated.

    Returns:
        A transformers text-classification pipeline.
    """
    return pipeline(
        "text-classification",
        model=model,
        tokenizer=model,
        max_length=max_length,
        truncation=True,
        device_map="auto",  # let transformers place the model (GPU if available)
    )


def detect_text_by_ai_model(
    input_text: str,
    model: str = AI_TEXT_DECTECTION_MODEL,
    max_length: int = 512,
) -> tuple:
    """
    Model: RADAR-Vicuna-7B
    Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B

    Detects if text is human or machine generated.

    Args:
        input_text (str): The text to be classified.
        model (str, optional): The name of the AI text detection model.
        max_length (int, optional): The maximum length of the input text.

    Returns:
        tuple: (label, confidence_score). label is HUMAN, or MACHINE with an
            "<br>(model)" suffix naming the most likely generating model.
            On any failure, (UNKNOWN, 0.5) — a neutral confidence.
    """
    try:
        pipe = _get_detection_pipeline(model, max_length)

        # Replace HTML line breaks with spaces to improve processing.
        input_text = input_text.replace("<br>", " ")

        # Perform text classification using the pipeline.
        result = pipe(input_text)[0]
        confidence_score = result["score"]

        # Determine the label based on the model's prediction.
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
            # NOTE(review): original indentation was lost; attribution is
            # placed on the MACHINE branch, where it is meaningful — confirm.
            generated_model, _ = predict_generation_model(input_text)
            label += f"<br>({generated_model})"

        return label, confidence_score

    except Exception as e:  # boundary: degrade gracefully instead of crashing
        print(f"Error in Roberta model inference: {e}")
        return UNKNOWN, 0.5  # Neutral (0.5) confidence when detection fails.


def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Predicts if text is generated by gpt-4o or gpt-4o-mini models.
    Compares the input text against the paraphrased text by the models.

    Args:
        text (str): The input text to be analyzed.

    Returns:
        tuple: (model_name, confidence_score) where model_name is the GPT
            model whose paraphrase is most similar to the input, and
            confidence_score is that highest similarity. Falls back to the
            first configured model with score 0.0 if every paraphrase fails.
    """
    best_similarity = 0.0
    best_model = GPT_PARAPHRASE_MODELS[0]  # default when all models fail

    for candidate in GPT_PARAPHRASE_MODELS:
        # Generate paraphrased text using the current model.
        paraphrased_text = paraphrase_by_AI(text, candidate)
        if paraphrased_text is None:
            # Paraphrasing failed for this model; try the next one.
            continue

        # Similarity between the original text and the paraphrased text.
        similarity = measure_text_similarity(text, paraphrased_text)
        if similarity > best_similarity:
            best_similarity = similarity
            best_model = candidate

    return best_model, best_similarity


def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> Optional[str]:
    """
    Paraphrases text using a given AI model.

    Args:
        input_text (str): The text to be paraphrased.
        model (str, optional): The AI model to use for paraphrasing.

    Returns:
        Optional[str]: The paraphrased text, or None if the API call fails.
    """
    prompt = f"""
    Paraphrase the following news, only output the paraphrased text:
    {input_text}
    """
    try:
        # Default sampling parameters (temperature, top_p, max_tokens) are
        # used deliberately; tune here if paraphrase quality needs control.
        response = AZUREOPENAI_CLIENT.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content
    except OpenAIError as e:  # narrow: only API/SDK errors are expected here
        print(f"Error in AI model inference: {e}")
        return None


def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measures the similarity between two texts using cosine similarity
    of their sentence embeddings.

    Args:
        text1 (str): The first text string.
        text2 (str): The second text string.

    Returns:
        float: The cosine similarity score between the two texts.
    """
    # Generate sentence embeddings on the configured device.
    embeddings1 = PARAPHRASE_MODEL.encode(
        text1,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        text2,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # cos_sim returns a 1x1 matrix for two single texts; extract the scalar.
    similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    return similarity[0][0]