import os
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from openai import OpenAI
from typing import List, Dict, Any

# --- New Inference Code Components ---

# Define categories with sub-level information
CATEGORIES = {
    'hateful': ['hateful_lvl_1_discriminatory', 'hateful_lvl_2_hate_speech'],
    'insults': ['insults'],
    'sexual': ['sexual_lvl_1_not_appropriate_for_minors', 'sexual_lvl_2_not_appropriate_for_all'],
    'physical_violence': ['physical_violence'],
    'self_harm': ['self_harm_lvl_1_intent', 'self_harm_lvl_2_action'],
    'all_other_misconduct': ['all_other_misconduct_lvl_1_not_socially_accepted', 'all_other_misconduct_lvl_2_illegal']
}
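
# Ordinal scheme: each multi-level category is scored 0/1/2; its level-1
# label fires for scores >= 1 and its level-2 label only for score 2 (see
# run_model below). Single-entry categories are plain binary labels.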

def get_embeddings(texts: List[str], model: str = "text-embedding-3-large") -> np.ndarray:
    """
    Generate embeddings for a list of texts using the OpenAI API synchronously.
    
    Args:
        texts: List of strings to embed.
        model: The OpenAI embedding model to use.
    
    Returns:
        A numpy array of embeddings.
    """
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    MAX_TOKENS = 8191  # Input limit of the embedding model, in tokens
    # Character-based slicing is only a rough stand-in for token truncation:
    # typical English text comes in far below the limit this way, though
    # token-dense Unicode could still exceed it.
    truncated_texts = [text[:MAX_TOKENS] for text in texts]
    
    response = client.embeddings.create(
        input=truncated_texts,
        model=model
    )
    
    embeddings = np.array([data.embedding for data in response.data])
    return embeddings
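
# A token-accurate alternative to the character slice above, sketched here
# assuming the optional tiktoken package (text-embedding-3-* models use the
# cl100k_base encoding); this helper is illustrative, not part of the app:
def truncate_to_tokens(text: str, max_tokens: int = 8191) -> str:
    """Truncate text to at most max_tokens tokens (requires tiktoken)."""
    import tiktoken
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    return enc.decode(tokens[:max_tokens])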

def run_model(model_file: str, embeddings: np.ndarray):
    """
    Run the model on the embeddings.
    
    Args:
        model_file: Path to the model file.
        embeddings: Numpy array of embeddings.
    
    Returns:
        A tuple (expanded_predictions, expanded_probabilities,
        expanded_label_names); the first two are arrays of shape
        (n_samples, n_labels).
    """
    print("Loading model...")
    model_data = joblib.load(model_file)
    model = model_data['model']
    label_names = model_data['label_names']
    
    print("Predicting...")
    # raw_predictions is a list of arrays with shape (n_samples, 2)
    raw_predictions = model.predict(embeddings)
    
    print("Processing predictions...")
    predictions = []
    probabilities = []
    # Process each category's raw predictions
    for i, pred in enumerate(raw_predictions):
        # Convert raw predictions (P(y>0), P(y>1)) into a class from {0, 1, 2}
        pred_class = np.zeros(len(pred))
        pred_class += (pred[:, 0] > 0.5).astype(int)  # y > 0
        pred_class += (pred[:, 1] > 0.5).astype(int)  # y > 1
        predictions.append(pred_class)
        
        # Calculate probabilities for each class:
        # P(y=0) = 1 - P(y>0), P(y=1) = P(y>0) - P(y>1), P(y=2) = P(y>1)
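        # Worked example (hypothetical values): a row (0.9, 0.3) decodes to
        # class 0 + 1 + 0 = 1 with probabilities (0.1, 0.6, 0.3).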
        prob = np.zeros((len(pred), 3))
        prob[:, 0] = 1 - pred[:, 0]
        prob[:, 1] = pred[:, 0] - pred[:, 1]
        prob[:, 2] = pred[:, 1]
        probabilities.append(prob)
    
    predictions = np.array(predictions).T
    probabilities = np.array(probabilities).transpose(1, 0, 2)
    
    # Expand predictions to sub-levels
    expanded_predictions = []
    expanded_probabilities = []
    expanded_label_names = []
    for i, cat in enumerate(label_names):
        # Level 1 binary 
        y_pred_l1 = (predictions[:, i] > 0).astype(int) # y == 1 or y == 2
        y_proba_l1 = 1 - probabilities[:, i, 0]  # 1 - P(class 0)
        
        # Level 2 binary 
        y_pred_l2 = (predictions[:, i] == 2).astype(int) # only y == 2
        y_proba_l2 = probabilities[:, i, 2]  # Probability of class 2
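        # Worked example (hypothetical): ordinal class 2 for 'hateful' fires
        # both 'hateful_lvl_1_discriminatory' (y_pred_l1 = 1) and
        # 'hateful_lvl_2_hate_speech' (y_pred_l2 = 1).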

        # Single-level categories keep the plain category name as their label;
        # multi-level categories expand into their two sub-level labels.
        if cat in ['binary', 'insults', 'physical_violence']:
            expanded_predictions.append(y_pred_l1)
            expanded_probabilities.append(y_proba_l1)
            expanded_label_names.append(cat)
        else:
            expanded_predictions.append(y_pred_l1)
            expanded_probabilities.append(y_proba_l1)
            expanded_label_names.append(CATEGORIES[cat][0])
            
            expanded_predictions.append(y_pred_l2)
            expanded_probabilities.append(y_proba_l2)
            expanded_label_names.append(CATEGORIES[cat][1])
    
    expanded_predictions = np.array(expanded_predictions).T
    expanded_probabilities = np.array(expanded_probabilities).T
    
    return expanded_predictions, expanded_probabilities, expanded_label_names

def format_output(predictions: np.ndarray, probabilities: np.ndarray, label_names: List[str]) -> pd.DataFrame:
    """
    Format the output predictions into a DataFrame.
    
    Args:
        predictions: Binary predictions.
        probabilities: Associated prediction scores.
        label_names: List of label names.
    
    Returns:
        DataFrame with columns "Label", "Prediction", and "Score".
    """
    # As our Gradio interface processes one text at a time, we use the first (and only) sample.
    data = {
        "Label": label_names,
        "Prediction": predictions[0].tolist(),
        "Score": np.round(probabilities[0], 4).tolist()
    }
    return pd.DataFrame(data)
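
# Example of the returned table for a benign input (hypothetical scores):
#
#                           Label  Prediction   Score
# 0  hateful_lvl_1_discriminatory           0  0.0312
# 1     hateful_lvl_2_hate_speech           0  0.0074
# 2                       insults           0  0.0519
# ...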

# --- Gradio App Integration ---

# Define model file path (adjust as necessary)
MODEL_FILE = "model.joblib"
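
# Performance note: run_model() reloads model.joblib on every request. A
# minimal sketch of loading it once (assuming the same joblib bundle layout,
# with 'model' and 'label_names' keys) could look like this:
from functools import lru_cache

@lru_cache(maxsize=1)
def load_model_data(model_file: str = MODEL_FILE) -> Dict[str, Any]:
    """Load and cache the model bundle so repeated requests skip disk I/O."""
    return joblib.load(model_file)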

def classify_text(text: str):
    """
    Generate embeddings for the input text, run model inference, and
    return a DataFrame of classification results.
    """
    if not text.strip():
        # Return an empty DataFrame if no text provided
        empty_df = pd.DataFrame({"Label": [], "Prediction": [], "Score": []})
        return gr.update(value=empty_df, visible=True)
    
    # Obtain embeddings (input must be a list)
    embeddings = get_embeddings([text])
    
    # Run inference on the embeddings using the new model file
    predictions, probabilities, label_names = run_model(MODEL_FILE, embeddings)
    
    # Format the results to a DataFrame that Gradio can display
    df = format_output(predictions, probabilities, label_names)
    return gr.update(value=df, visible=True)

with gr.Blocks(title="Zoo Entry 001 - Updated Inference") as iface:
    input_text = gr.Textbox(lines=5, label="Input Text")
    submit_btn = gr.Button("Submit")
    output_table = gr.DataFrame(label="Classification Results", visible=False)
    
    submit_btn.click(fn=classify_text, inputs=input_text, outputs=output_table)

if __name__ == "__main__":
    iface.launch()