import functools
import os
from typing import Any, Dict, List, Tuple

import gradio as gr
import joblib
import numpy as np
import pandas as pd
from openai import OpenAI

# --- New Inference Code Components ---

# Define categories with sub-level information.
# Maps each top-level category to its ordered sub-level label names
# (index 0 = level 1, index 1 = level 2 where a second level exists).
CATEGORIES = {
    'hateful': ['hateful_lvl_1_discriminatory', 'hateful_lvl_2_hate_speech'],
    'insults': ['insults'],
    'sexual': ['sexual_lvl_1_not_appropriate_for_minors', 'sexual_lvl_2_not_appropriate_for_all'],
    'physical_violence': ['physical_violence'],
    'self_harm': ['self_harm_lvl_1_intent', 'self_harm_lvl_2_action'],
    'all_other_misconduct': [
        'all_other_misconduct_lvl_1_not_socially_accepted',
        'all_other_misconduct_lvl_2_illegal',
    ],
}

# Labels that are reported as a single binary output (no level-2 split).
_SINGLE_LEVEL_LABELS = frozenset({'binary', 'insults', 'physical_violence'})


def get_embeddings(texts: List[str], model: str = "text-embedding-3-large") -> np.ndarray:
    """
    Generate embeddings for a list of texts using the OpenAI API synchronously.

    Args:
        texts: List of strings to embed.
        model: The OpenAI embedding model to use.

    Returns:
        A numpy array of embeddings, one row per input text.
    """
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    # 8191 is the token limit the original code cites for the embedding model.
    # NOTE: the slice below cuts by *characters*, not tokens, so it is only a
    # rough guard; text whose characters expand to several tokens each may
    # still exceed the limit.
    max_chars = 8191
    truncated_texts = [text[:max_chars] for text in texts]

    response = client.embeddings.create(input=truncated_texts, model=model)
    return np.array([item.embedding for item in response.data])


@functools.lru_cache(maxsize=4)
def _load_model(model_file: str) -> Tuple[Any, Any]:
    """Load the model bundle at *model_file* once and cache it for later calls.

    The file is expected to be a joblib dump of a dict with the keys
    ``'model'`` and ``'label_names'``. Because the result is cached, changes
    to the file on disk are not picked up until the process restarts.
    """
    print("Loading model...")
    # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
    model_data = joblib.load(model_file)
    return model_data['model'], model_data['label_names']


def run_model(model_file: str, embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """
    Run the model on the embeddings.

    Args:
        model_file: Path to the model file.
        embeddings: Numpy array of embeddings.

    Returns:
        expanded_predictions, expanded_probabilities, expanded_label_names.
        The two arrays have one row per sample and one column per expanded
        label; the list gives the label name of each column.
    """
    model, label_names = _load_model(model_file)

    print("Predicting...")
    # raw_predictions is expected to be a list (one entry per category) of
    # arrays with shape (n_samples, 2) holding P(y>0) and P(y>1).
    raw_predictions = model.predict(embeddings)

    print("Processing predictions...")
    predictions = []
    probabilities = []
    for pred in raw_predictions:
        p_gt0 = pred[:, 0]
        p_gt1 = pred[:, 1]

        # Convert (P(y>0), P(y>1)) into an ordinal class from {0, 1, 2} by
        # counting how many thresholds are exceeded.
        predictions.append((p_gt0 > 0.5).astype(int) + (p_gt1 > 0.5).astype(int))

        # P(y=0) = 1 - P(y>0), P(y=1) = P(y>0) - P(y>1), P(y=2) = P(y>1)
        probabilities.append(np.column_stack([1 - p_gt0, p_gt0 - p_gt1, p_gt1]))

    # (n_categories, n_samples) -> (n_samples, n_categories)
    predictions = np.array(predictions).T
    # (n_categories, n_samples, 3) -> (n_samples, n_categories, 3)
    probabilities = np.array(probabilities).transpose(1, 0, 2)

    # Expand predictions to sub-levels
    expanded_predictions = []
    expanded_probabilities = []
    expanded_label_names = []
    for i, cat in enumerate(label_names):
        # Level 1 binary: class 1 or class 2.
        y_pred_l1 = (predictions[:, i] > 0).astype(int)
        y_proba_l1 = 1 - probabilities[:, i, 0]  # 1 - P(class 0)

        expanded_predictions.append(y_pred_l1)
        expanded_probabilities.append(y_proba_l1)

        if cat in _SINGLE_LEVEL_LABELS:
            expanded_label_names.append(cat)
            continue

        expanded_label_names.append(CATEGORIES[cat][0])

        # Level 2 binary: only class 2.
        y_pred_l2 = (predictions[:, i] == 2).astype(int)
        y_proba_l2 = probabilities[:, i, 2]  # Probability of class 2

        expanded_predictions.append(y_pred_l2)
        expanded_probabilities.append(y_proba_l2)
        expanded_label_names.append(CATEGORIES[cat][1])

    expanded_predictions = np.array(expanded_predictions).T
    expanded_probabilities = np.array(expanded_probabilities).T

    return expanded_predictions, expanded_probabilities, expanded_label_names


def format_output(predictions: np.ndarray, probabilities: np.ndarray, label_names: List[str]) -> pd.DataFrame:
    """
    Format the output predictions into a DataFrame.

    Args:
        predictions: Binary predictions.
        probabilities: Associated prediction scores.
        label_names: List of label names.

    Returns:
        DataFrame with columns "Label", "Prediction", and "Score".
    """
    # As our Gradio interface processes one text at a time, we use the first
    # (and only) sample.
    data = {
        "Label": label_names,
        "Prediction": predictions[0].tolist(),
        "Score": np.round(probabilities[0], 4).tolist(),
    }
    return pd.DataFrame(data)


# --- Gradio App Integration ---

# Define model file path (adjust as necessary)
MODEL_FILE = "model.joblib"


def classify_text(text: str):
    """
    Given an input text, generates embeddings, runs the model inference,
    and returns a Gradio update carrying a DataFrame of classification results.

    An empty, whitespace-only, or missing input yields an empty results table.
    """
    if not text or not text.strip():
        # Return an empty DataFrame if no text provided
        empty_df = pd.DataFrame({"Label": [], "Prediction": [], "Score": []})
        return gr.update(value=empty_df, visible=True)

    # Obtain embeddings (input must be a list)
    embeddings = get_embeddings([text])

    # Run inference on the embeddings using the new model file
    predictions, probabilities, label_names = run_model(MODEL_FILE, embeddings)

    # Format the results to a DataFrame that Gradio can display
    df = format_output(predictions, probabilities, label_names)
    return gr.update(value=df, visible=True)


with gr.Blocks(title="Zoo Entry 001 - Updated Inference") as iface:
    input_text = gr.Textbox(lines=5, label="Input Text")
    submit_btn = gr.Button("Submit")
    output_table = gr.DataFrame(label="Classification Results", visible=False)

    submit_btn.click(fn=classify_text, inputs=input_text, outputs=output_table)

if __name__ == "__main__":
    iface.launch()