Spaces:
Sleeping
Sleeping
import os | |
import gradio as gr | |
import joblib | |
import numpy as np | |
import pandas as pd | |
from openai import OpenAI | |
from typing import List, Dict, Any | |
# --- New Inference Code Components ---

# Top-level category -> names of the output labels it expands into.
# A one-element list is a single binary label; a two-element list holds the
# level-1 label name followed by the level-2 label name.
CATEGORIES = {
    'hateful': [
        'hateful_lvl_1_discriminatory',
        'hateful_lvl_2_hate_speech',
    ],
    'insults': [
        'insults',
    ],
    'sexual': [
        'sexual_lvl_1_not_appropriate_for_minors',
        'sexual_lvl_2_not_appropriate_for_all',
    ],
    'physical_violence': [
        'physical_violence',
    ],
    'self_harm': [
        'self_harm_lvl_1_intent',
        'self_harm_lvl_2_action',
    ],
    'all_other_misconduct': [
        'all_other_misconduct_lvl_1_not_socially_accepted',
        'all_other_misconduct_lvl_2_illegal',
    ],
}
def get_embeddings(texts: List[str], model: str = "text-embedding-3-large") -> np.ndarray:
    """
    Generate embeddings for a list of texts using the OpenAI API synchronously.

    Args:
        texts: List of strings to embed.
        model: The OpenAI embedding model to use.

    Returns:
        A numpy array with one embedding row per input text, in the order
        returned by the API. An empty ``texts`` list yields an empty array of
        shape (0, 0) and no request is sent.
    """
    if not texts:
        # Nothing to embed: skip building a client and the network round-trip.
        return np.empty((0, 0))

    # Per-text length cap applied before sending.
    # NOTE: slicing a str counts characters, not tokens, so this is only a
    # rough guard against over-long inputs rather than an exact token limit.
    MAX_TOKENS = 8191
    truncated_texts = [text[:MAX_TOKENS] for text in texts]

    # The API key is read from the environment on every call.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    response = client.embeddings.create(
        input=truncated_texts,
        model=model,
    )
    return np.array([data.embedding for data in response.data])
def run_model(model_file: str, embeddings: np.ndarray):
    """
    Load a saved model bundle and score the given embeddings.

    The bundle read from ``model_file`` must provide a 'model' entry (whose
    ``predict`` is called on the embeddings) and a 'label_names' entry. Each
    element of the prediction output is treated as cumulative scores
    (P(y>0), P(y>1)) for one category and converted into per-level binary
    outputs.

    Args:
        model_file: Path to the model file.
        embeddings: Numpy array of embeddings.

    Returns:
        A tuple ``(expanded_predictions, expanded_probabilities,
        expanded_label_names)``. The two arrays have one row per sample and
        one column per expanded label; the list names those columns in order.
    """
    print("Loading model...")
    bundle = joblib.load(model_file)
    model, label_names = bundle['model'], bundle['label_names']

    print("Predicting...")
    # One array per category; columns 0 and 1 hold P(y>0) and P(y>1).
    raw_predictions = model.predict(embeddings)

    print("Processing predictions...")
    class_columns = []
    prob_blocks = []
    for cumulative in raw_predictions:
        n_rows = len(cumulative)
        p_gt0 = cumulative[:, 0]
        p_gt1 = cumulative[:, 1]

        # Ordinal class in {0, 1, 2}: count how many thresholds are passed.
        ordinal = np.zeros(n_rows)
        for threshold_scores in (p_gt0, p_gt1):
            ordinal += (threshold_scores > 0.5).astype(int)
        class_columns.append(ordinal)

        # Per-class probabilities derived from the cumulative scores:
        # P(y=0) = 1 - P(y>0), P(y=1) = P(y>0) - P(y>1), P(y=2) = P(y>1)
        class_probs = np.zeros((n_rows, 3))
        class_probs[:, 0] = 1 - p_gt0
        class_probs[:, 1] = p_gt0 - p_gt1
        class_probs[:, 2] = p_gt1
        prob_blocks.append(class_probs)

    # Rearrange to sample-major layout.
    predictions = np.array(class_columns).T
    probabilities = np.array(prob_blocks).transpose(1, 0, 2)

    # Expand each category into its binary sub-level outputs.
    expanded_predictions = []
    expanded_probabilities = []
    expanded_label_names = []
    single_level = ('binary', 'insults', 'physical_violence')
    for idx, category in enumerate(label_names):
        # Level 1: class is 1 or 2; score is 1 - P(class 0).
        level1_pred = (predictions[:, idx] > 0).astype(int)
        level1_proba = 1 - probabilities[:, idx, 0]
        # Level 2: class is exactly 2; score is P(class 2).
        level2_pred = (predictions[:, idx] == 2).astype(int)
        level2_proba = probabilities[:, idx, 2]

        if category in single_level:
            outputs = [(category, level1_pred, level1_proba)]
        else:
            sub_labels = CATEGORIES[category]
            outputs = [
                (sub_labels[0], level1_pred, level1_proba),
                (sub_labels[1], level2_pred, level2_proba),
            ]

        for name, binary_pred, score in outputs:
            expanded_predictions.append(binary_pred)
            expanded_probabilities.append(score)
            expanded_label_names.append(name)

    expanded_predictions = np.array(expanded_predictions).T
    expanded_probabilities = np.array(expanded_probabilities).T
    return expanded_predictions, expanded_probabilities, expanded_label_names
def format_output(predictions: np.ndarray, probabilities: np.ndarray, label_names: List[str]) -> pd.DataFrame:
    """
    Build a results table for the first sample of a prediction batch.

    Args:
        predictions: Binary predictions.
        probabilities: Associated prediction scores.
        label_names: List of label names.

    Returns:
        DataFrame with columns "Label", "Prediction", and "Score", where the
        scores are rounded to 4 decimal places.
    """
    # The Gradio interface submits one text at a time, so only row 0 is shown.
    first_predictions = predictions[0].tolist()
    first_scores = np.round(probabilities[0], 4).tolist()

    columns = {
        "Label": label_names,
        "Prediction": first_predictions,
        "Score": first_scores,
    }
    return pd.DataFrame(columns)
# --- Gradio App Integration ---

# Location of the serialized model bundle used for inference
# (adjust as necessary).
MODEL_FILE: str = "model.joblib"
def classify_text(text: str):
    """
    Given an input text, generates embeddings, runs the model inference,
    and returns a Gradio update carrying a DataFrame of classification results.

    Args:
        text: The text to classify. ``None``, an empty string, or a
            whitespace-only string is treated as "no input".

    Returns:
        A ``gr.update`` that makes the output table visible and sets its
        value: an empty table with the "Label", "Prediction" and "Score"
        columns when there is no input, otherwise the formatted predictions.
    """
    # Guard against None as well as blank text; calling .strip() on None
    # would raise AttributeError.
    if not text or not text.strip():
        # Return an empty DataFrame if no text provided
        empty_df = pd.DataFrame({"Label": [], "Prediction": [], "Score": []})
        return gr.update(value=empty_df, visible=True)

    # Obtain embeddings (input must be a list)
    embeddings = get_embeddings([text])

    # Run inference on the embeddings using the configured model file
    predictions, probabilities, label_names = run_model(MODEL_FILE, embeddings)

    # Format the results to a DataFrame that Gradio can display
    df = format_output(predictions, probabilities, label_names)
    return gr.update(value=df, visible=True)
# Build the UI: a multi-line text input, a submit button, and a results table
# that stays hidden until the first classification is returned.
with gr.Blocks(title="Zoo Entry 001 - Updated Inference") as iface:
    input_text = gr.Textbox(label="Input Text", lines=5)
    submit_btn = gr.Button(value="Submit")
    output_table = gr.DataFrame(visible=False, label="Classification Results")

    # Clicking the button sends the textbox content through classify_text and
    # writes the result into the table.
    submit_btn.click(
        fn=classify_text,
        inputs=input_text,
        outputs=output_table,
    )


if __name__ == "__main__":
    # Start the Gradio server only when executed as a script.
    iface.launch()