# gabrielchua's picture
# Update app.py
# c657583 verified
import os
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from openai import OpenAI
from typing import List, Dict, Any
# --- New Inference Code Components ---
# Define categories with sub-level information
# Maps each top-level category to its ordered sub-level label names.
# Two-entry lists are [level-1 label, level-2 label]; one-entry lists have no
# sub-levels and simply repeat the category name.
CATEGORIES = {
    "hateful": [
        "hateful_lvl_1_discriminatory",
        "hateful_lvl_2_hate_speech",
    ],
    "insults": ["insults"],
    "sexual": [
        "sexual_lvl_1_not_appropriate_for_minors",
        "sexual_lvl_2_not_appropriate_for_all",
    ],
    "physical_violence": ["physical_violence"],
    "self_harm": [
        "self_harm_lvl_1_intent",
        "self_harm_lvl_2_action",
    ],
    "all_other_misconduct": [
        "all_other_misconduct_lvl_1_not_socially_accepted",
        "all_other_misconduct_lvl_2_illegal",
    ],
}
def get_embeddings(texts: List[str], model: str = "text-embedding-3-large") -> np.ndarray:
    """
    Generate embeddings for a list of texts using the OpenAI API synchronously.

    Args:
        texts: List of strings to embed.
        model: The OpenAI embedding model to use.

    Returns:
        A numpy array with one embedding row per input text. An empty
        ``texts`` list yields an empty array of shape (0, 0) and no API
        request is made.
    """
    # Nothing to embed: return early rather than sending an empty request,
    # which the embeddings endpoint is expected to reject.
    if not texts:
        return np.empty((0, 0))

    # The limit mirrors the embedding model's 8191-token maximum, but slicing
    # a str counts *characters*, not tokens, so this is only an approximation
    # of the token cap.
    MAX_CHARS = 8191
    truncated_texts = [text[:MAX_CHARS] for text in texts]

    # The API key is read from the environment on every call.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    response = client.embeddings.create(
        input=truncated_texts,
        model=model,
    )
    return np.array([data.embedding for data in response.data])
# Loaded joblib payloads keyed by file path, so repeated requests do not
# re-read and re-deserialize the model from disk.
_MODEL_CACHE: Dict[str, Any] = {}


def _load_model_data(model_file: str) -> Any:
    """Return the joblib payload stored at model_file, loading it on first use only."""
    if model_file not in _MODEL_CACHE:
        print("Loading model...")
        # NOTE(review): joblib.load unpickles arbitrary objects; only point
        # this at model files from a trusted source.
        _MODEL_CACHE[model_file] = joblib.load(model_file)
    return _MODEL_CACHE[model_file]


def _decode_ordinal_output(pred):
    """Convert one category's (P(y>0), P(y>1)) columns into class ids and 3-class probabilities."""
    p_gt_0 = pred[:, 0]
    p_gt_1 = pred[:, 1]

    # Class in {0, 1, 2}: count how many of the two thresholds are exceeded.
    pred_class = (p_gt_0 > 0.5).astype(int) + (p_gt_1 > 0.5).astype(int)

    # P(y=0) = 1 - P(y>0), P(y=1) = P(y>0) - P(y>1), P(y=2) = P(y>1).
    # The buffer is allocated with np.zeros (float64) and filled in place so
    # the scores keep the same precision regardless of the model's dtype.
    prob = np.zeros((len(pred), 3))
    prob[:, 0] = 1 - p_gt_0
    prob[:, 1] = p_gt_0 - p_gt_1
    prob[:, 2] = p_gt_1
    return pred_class, prob


def run_model(model_file: str, embeddings: np.ndarray):
    """
    Run the model on the embeddings.

    The payload at ``model_file`` is expected to be subscriptable with the
    keys ``'model'`` (an object exposing ``predict``) and ``'label_names'``.
    It is loaded from disk the first time a given path is used and reused
    from an in-memory cache on later calls.

    Args:
        model_file: Path to the model file.
        embeddings: Numpy array of embeddings.

    Returns:
        expanded_predictions, expanded_probabilities, expanded_label_names:
        the first two are arrays with one row per sample and one column per
        expanded label (binary 0/1 decisions and their scores); the last is
        the list of label names in column order.

    Raises:
        KeyError: if a label name is neither a single-level label nor a key
            of ``CATEGORIES``.
    """
    model_data = _load_model_data(model_file)
    model = model_data['model']
    label_names = model_data['label_names']

    print("Predicting...")
    # raw_predictions is a list of arrays with shape (n_samples, 2),
    # one array per category.
    raw_predictions = model.predict(embeddings)

    print("Processing predictions...")
    predictions = []
    probabilities = []
    for pred in raw_predictions:
        pred_class, prob = _decode_ordinal_output(pred)
        predictions.append(pred_class)
        probabilities.append(prob)

    # Reorder so samples come first: (n_samples, n_categories[, 3]).
    predictions = np.array(predictions).T
    probabilities = np.array(probabilities).transpose(1, 0, 2)

    # Labels reported as a single binary column; every other label is
    # expanded into its two sub-level columns from CATEGORIES.
    single_level_labels = ('binary', 'insults', 'physical_violence')

    expanded_predictions = []
    expanded_probabilities = []
    expanded_label_names = []
    for i, cat in enumerate(label_names):
        # Level 1: class 1 or 2, scored as 1 - P(class 0).
        y_pred_l1 = (predictions[:, i] > 0).astype(int)
        y_proba_l1 = 1 - probabilities[:, i, 0]
        expanded_predictions.append(y_pred_l1)
        expanded_probabilities.append(y_proba_l1)

        if cat in single_level_labels:
            expanded_label_names.append(cat)
            continue

        expanded_label_names.append(CATEGORIES[cat][0])

        # Level 2: class 2 only, scored as P(class 2).
        y_pred_l2 = (predictions[:, i] == 2).astype(int)
        y_proba_l2 = probabilities[:, i, 2]
        expanded_predictions.append(y_pred_l2)
        expanded_probabilities.append(y_proba_l2)
        expanded_label_names.append(CATEGORIES[cat][1])

    expanded_predictions = np.array(expanded_predictions).T
    expanded_probabilities = np.array(expanded_probabilities).T
    return expanded_predictions, expanded_probabilities, expanded_label_names
def format_output(predictions: np.ndarray, probabilities: np.ndarray, label_names: List[str]) -> pd.DataFrame:
    """
    Build the results table for a single classified text.

    Only the first row of ``predictions`` and ``probabilities`` is used,
    because the Gradio interface classifies one text per request.

    Args:
        predictions: Binary predictions, one row per sample.
        probabilities: Scores matching ``predictions``, one row per sample.
        label_names: Label name for each column of the two arrays.

    Returns:
        DataFrame with columns "Label", "Prediction", and "Score", where
        scores are rounded to four decimal places.
    """
    first_sample_predictions = predictions[0].tolist()
    first_sample_scores = np.round(probabilities[0], 4).tolist()

    columns = {
        "Label": label_names,
        "Prediction": first_sample_predictions,
        "Score": first_sample_scores,
    }
    return pd.DataFrame(columns)
# --- Gradio App Integration ---
# Path to the serialized model payload (adjust as necessary).
MODEL_FILE = "model.joblib"


def classify_text(text: str):
    """
    Given an input text, generates embeddings, runs the model inference,
    and returns a DataFrame of classification results.

    Args:
        text: The text to classify. ``None``, an empty string, or a
            whitespace-only string is treated as "no input".

    Returns:
        A ``gr.update`` that makes the results table visible and sets its
        value: an empty table with the "Label", "Prediction" and "Score"
        columns when there is no input, otherwise the classification results.
    """
    # Guard against None (e.g. a programmatic call with no value) as well as
    # blank text; calling .strip() on None would raise AttributeError.
    if text is None or not text.strip():
        empty_df = pd.DataFrame({"Label": [], "Prediction": [], "Score": []})
        return gr.update(value=empty_df, visible=True)

    # Obtain embeddings (input must be a list).
    embeddings = get_embeddings([text])

    # Run inference on the embeddings using the configured model file.
    predictions, probabilities, label_names = run_model(MODEL_FILE, embeddings)

    # Format the results to a DataFrame that Gradio can display.
    df = format_output(predictions, probabilities, label_names)
    return gr.update(value=df, visible=True)
# Assemble the UI: a text input, a submit button, and a results table that
# starts hidden and is made visible by the update classify_text returns.
with gr.Blocks(title="Zoo Entry 001 - Updated Inference") as iface:
    input_text = gr.Textbox(label="Input Text", lines=5)
    submit_btn = gr.Button(value="Submit")
    output_table = gr.DataFrame(visible=False, label="Classification Results")

    # Clicking the button classifies the textbox content into the table.
    submit_btn.click(classify_text, inputs=input_text, outputs=output_table)

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()