"""
Author: Khanh Phan
Date: 2024-12-04
"""
from openai import OpenAIError
from sentence_transformers import util
from transformers import pipeline

from src.application.config import (
    AI_TEXT_DECTECTION_MODEL,
    AZUREOPENAI_CLIENT,
    DEVICE,
    GPT_PARAPHRASE_MODELS,
    HUMAN,
    MACHINE,
    MODEL_HUMAN_LABEL,
    PARAPHRASE_MODEL,
    UNKNOWN,
)


def detect_text_by_ai_model(
    input_text: str,
    model: str = AI_TEXT_DECTECTION_MODEL,
    max_length: int = 512,
) -> tuple[str, float]:
"""
Model: RADAR-Vicuna-7B
Ref: https://huggingface.co/TrustSafeAI/RADAR-Vicuna-7B
Detects if text is human or machine generated.
Args:
input_text (str): The text to be classified.
model (str, optional): The name of the AI text detection model.
max_length (int, optional): The maximum length of the input text.
Returns:
tuple: (label, confidence_score)
where label is HUMAN or MACHINE.
"""
    try:
        # Create a text classification pipeline using the specified model.
        pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=model,
            max_length=max_length,  # TODO: consider removal
            truncation=True,
            device_map="auto",  # Place the model on available GPUs automatically.
        )

        # Replace HTML line breaks with spaces to improve processing.
        input_text = input_text.replace("<br>", " ")

        # Perform text classification using the pipeline.
        result = pipe(input_text)[0]
        confidence_score = result["score"]

        # Determine the label based on the model's prediction.
        if result["label"] == MODEL_HUMAN_LABEL[model]:
            label = HUMAN
        else:
            label = MACHINE
            # For machine-generated text, also guess which GPT model wrote it.
            generated_model, _ = predict_generation_model(input_text)
            label += f"<br>({generated_model})"

        return label, confidence_score
    except Exception as e:
        print(f"Error in AI detection model inference: {e}")
        return UNKNOWN, 0.5  # Fall back to UNKNOWN with neutral 0.5 confidence.
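
# A minimal usage sketch (assumes the config constants above are initialized
# and the RADAR-Vicuna-7B weights are available from Hugging Face; the sample
# string and printed values are illustrative):
#
#     label, score = detect_text_by_ai_model("A short paragraph to check.")
#     print(label, score)  # e.g. HUMAN, 0.97
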
def predict_generation_model(text: str) -> tuple[str, float]:
    """
    Predicts whether the text was generated by gpt-4o or gpt-4o-mini.

    Each candidate model paraphrases the input text; the paraphrase most
    similar to the input indicates the likely generator.

    Args:
        text (str): The input text to be analyzed.

    Returns:
        tuple: (label, confidence_score),
            where label is gpt-4o or gpt-4o-mini,
            and confidence_score is the highest similarity.
    """
    best_similarity = 0
    best_model = GPT_PARAPHRASE_MODELS[0]

    for model in GPT_PARAPHRASE_MODELS:
        # Generate paraphrased text using the current model.
        paraphrased_text = paraphrase_by_AI(text, model)

        # Skip to the next model if paraphrasing fails (returns None).
        if paraphrased_text is None:
            continue

        # Similarity between the original text and the paraphrased text.
        similarity = measure_text_similarity(text, paraphrased_text)

        # Update the best similarity and model seen so far.
        if similarity > best_similarity:
            best_similarity = similarity
            best_model = model

    return best_model, best_similarity
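
# A hedged usage sketch (assumes GPT_PARAPHRASE_MODELS is a list such as
# ["gpt-4o", "gpt-4o-mini"] and the Azure OpenAI client is configured;
# the example output is illustrative):
#
#     model_name, similarity = predict_generation_model(article_text)
#     print(model_name, similarity)  # e.g. gpt-4o-mini, 0.91
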
def paraphrase_by_AI(input_text: str, model: str = "gpt-4o-mini") -> str | None:
    """
    Paraphrases text using a given AI model.

    Args:
        input_text (str): The text to be paraphrased.
        model (str, optional): The AI model to use for paraphrasing.

    Returns:
        str: The paraphrased text, or None if an error occurs.
    """
    prompt = f"""
    Paraphrase the following news, only output the paraphrased text:
    {input_text}
    """
    try:
        response = AZUREOPENAI_CLIENT.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
            # max_tokens=100,   # Limit the number of tokens in the response.
            # temperature=0.7,  # Control the randomness of the response.
            # top_p=0.9,        # Control the nucleus sampling.
            # n=1,              # Number of completions to generate.
        )
        paraphrased_text = response.choices[0].message.content
        return paraphrased_text
    except OpenAIError as e:
        print(f"Error in AI model inference: {e}")
        return None
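
# A minimal sketch of a direct call (assumes AZUREOPENAI_CLIENT is an
# AzureOpenAI client with a "gpt-4o-mini" deployment; output varies per call):
#
#     rewritten = paraphrase_by_AI("Stocks fell sharply on Monday.")
#     if rewritten is not None:
#         print(rewritten)
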
def measure_text_similarity(text1: str, text2: str) -> float:
    """
    Measures the similarity between two texts
    using cosine similarity of their sentence embeddings.

    Args:
        text1 (str): The first text string.
        text2 (str): The second text string.

    Returns:
        float: The cosine similarity score between the two texts.
    """
    # Generate sentence embeddings for both texts.
    embeddings1 = PARAPHRASE_MODEL.encode(
        text1,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        text2,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # Compute the cosine similarity matrix; its single entry is the score.
    similarity = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    return float(similarity[0][0])
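

if __name__ == "__main__":
    # A minimal smoke test, assuming the config constants (client, models,
    # device) are initialized; the sample text is illustrative only.
    sample = "The central bank raised interest rates by 25 basis points."

    label, score = detect_text_by_ai_model(sample)
    print(f"Detection: {label} (confidence {score:.2f})")

    model_name, similarity = predict_generation_model(sample)
    print(f"Closest paraphrase model: {model_name} (similarity {similarity:.2f})")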