import os
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from openai import OpenAI
from typing import List
# --- New Inference Code Components ---
# Define categories with sub-level information
CATEGORIES = {
'hateful': ['hateful_lvl_1_discriminatory', 'hateful_lvl_2_hate_speech'],
'insults': ['insults'],
'sexual': ['sexual_lvl_1_not_appropriate_for_minors', 'sexual_lvl_2_not_appropriate_for_all'],
'physical_violence': ['physical_violence'],
'self_harm': ['self_harm_lvl_1_intent', 'self_harm_lvl_2_action'],
'all_other_misconduct': ['all_other_misconduct_lvl_1_not_socially_accepted', 'all_other_misconduct_lvl_2_illegal']
}
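# A minimal sketch (illustrative, not used below): the flat list of sub-level
# labels can be derived from CATEGORIES, which is handy for sanity-checking
# model outputs against the expanded label names returned by run_model:
#   all_sub_labels = [lbl for subs in CATEGORIES.values() for lbl in subs]
#   len(all_sub_labels)  # -> 10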
def get_embeddings(texts: List[str], model: str = "text-embedding-3-large") -> np.ndarray:
"""
Generate embeddings for a list of texts using the OpenAI API synchronously.
Args:
texts: List of strings to embed.
model: The OpenAI embedding model to use.
Returns:
A numpy array of embeddings.
"""
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    MAX_TOKENS = 8191  # Maximum tokens accepted by the embedding model
    # Truncation here is by characters, which is a conservative stand-in for
    # token-level truncation (a token is never shorter than one character).
    truncated_texts = [text[:MAX_TOKENS] for text in texts]
response = client.embeddings.create(
input=truncated_texts,
model=model
)
embeddings = np.array([data.embedding for data in response.data])
return embeddings
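# Example usage, a minimal sketch assuming OPENAI_API_KEY is set in the environment:
#   vecs = get_embeddings(["some text", "another text"])
#   vecs.shape  # -> (2, 3072); text-embedding-3-large returns 3072-dimensional vectors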
def run_model(model_file: str, embeddings: np.ndarray):
"""
Run the model on the embeddings.
Args:
model_file: Path to the model file.
embeddings: Numpy array of embeddings.
    Returns:
        A tuple (expanded_predictions, expanded_probabilities, expanded_label_names),
        with each multi-level category expanded into per-level binary labels.
    """
print("Loading model...")
model_data = joblib.load(model_file)
model = model_data['model']
label_names = model_data['label_names']
print("Predicting...")
# raw_predictions is a list of arrays with shape (n_samples, 2)
raw_predictions = model.predict(embeddings)
print("Processing predictions...")
predictions = []
probabilities = []
# Process each category's raw predictions
for i, pred in enumerate(raw_predictions):
# Convert raw predictions (P(y>0), P(y>1)) into a class from {0, 1, 2}
pred_class = np.zeros(len(pred))
pred_class += (pred[:, 0] > 0.5).astype(int) # y > 0
pred_class += (pred[:, 1] > 0.5).astype(int) # y > 1
predictions.append(pred_class)
# Calculate probabilities for each class:
# P(y=0) = 1 - P(y>0), P(y=1) = P(y>0) - P(y>1), P(y=2) = P(y>1)
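        # Worked example with hypothetical values: pred[j] = (0.9, 0.3) gives the
        # hard class 1 above (0.9 > 0.5 but 0.3 <= 0.5) and class probabilities
        # (1 - 0.9, 0.9 - 0.3, 0.3) = (0.1, 0.6, 0.3) below.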
prob = np.zeros((len(pred), 3))
prob[:, 0] = 1 - pred[:, 0]
prob[:, 1] = pred[:, 0] - pred[:, 1]
prob[:, 2] = pred[:, 1]
probabilities.append(prob)
predictions = np.array(predictions).T
probabilities = np.array(probabilities).transpose(1, 0, 2)
# Expand predictions to sub-levels
expanded_predictions = []
expanded_probabilities = []
expanded_label_names = []
for i, cat in enumerate(label_names):
# Level 1 binary
y_pred_l1 = (predictions[:, i] > 0).astype(int) # y == 1 or y == 2
y_proba_l1 = 1 - probabilities[:, i, 0] # 1 - P(class 0)
# Level 2 binary
y_pred_l2 = (predictions[:, i] == 2).astype(int) # only y == 2
y_proba_l2 = probabilities[:, i, 2] # Probability of class 2
        # Single-level categories keep only the level-1 output under the original
        # name; two-level categories emit separate level-1 and level-2 entries.
        if cat in ['binary', 'insults', 'physical_violence']:
expanded_predictions.append(y_pred_l1)
expanded_probabilities.append(y_proba_l1)
expanded_label_names.append(cat)
else:
expanded_predictions.append(y_pred_l1)
expanded_probabilities.append(y_proba_l1)
expanded_label_names.append(CATEGORIES[cat][0])
expanded_predictions.append(y_pred_l2)
expanded_probabilities.append(y_proba_l2)
expanded_label_names.append(CATEGORIES[cat][1])
expanded_predictions = np.array(expanded_predictions).T
expanded_probabilities = np.array(expanded_probabilities).T
return expanded_predictions, expanded_probabilities, expanded_label_names
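# A minimal sketch of the expected shapes, assuming a model.joblib in the format
# unpacked above: for n input texts,
#   preds, probs, names = run_model("model.joblib", embeddings)
#   preds.shape  # -> (n, len(names)), binary 0/1 predictions per expanded label
#   probs.shape  # -> (n, len(names)), scores in [0, 1]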
def format_output(predictions: np.ndarray, probabilities: np.ndarray, label_names: List[str]) -> pd.DataFrame:
"""
Format the output predictions into a DataFrame.
Args:
predictions: Binary predictions.
probabilities: Associated prediction scores.
label_names: List of label names.
Returns:
DataFrame with columns "Label", "Prediction", and "Score".
"""
# As our Gradio interface processes one text at a time, we use the first (and only) sample.
data = {
"Label": label_names,
"Prediction": predictions[0].tolist(),
"Score": np.round(probabilities[0], 4).tolist()
}
return pd.DataFrame(data)
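# Illustrative output for a single input (all values hypothetical):
#   Label                         Prediction   Score
#   hateful_lvl_1_discriminatory           0   0.0123
#   hateful_lvl_2_hate_speech              0   0.0041
#   insults                                1   0.8750
#   ...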
# --- Gradio App Integration ---
# Define model file path (adjust as necessary)
MODEL_FILE = "model.joblib"
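# run_model expects MODEL_FILE to be a joblib dump of a dict with keys
# 'model' and 'label_names' (see the unpacking in run_model above).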
def classify_text(text: str):
"""
Given an input text, generates embeddings, runs the model inference,
and returns a DataFrame of classification results.
"""
if not text.strip():
# Return an empty DataFrame if no text provided
empty_df = pd.DataFrame({"Label": [], "Prediction": [], "Score": []})
return gr.update(value=empty_df, visible=True)
# Obtain embeddings (input must be a list)
embeddings = get_embeddings([text])
# Run inference on the embeddings using the new model file
predictions, probabilities, label_names = run_model(MODEL_FILE, embeddings)
# Format the results to a DataFrame that Gradio can display
df = format_output(predictions, probabilities, label_names)
return gr.update(value=df, visible=True)
with gr.Blocks(title="Zoo Entry 001 - Updated Inference") as iface:
input_text = gr.Textbox(lines=5, label="Input Text")
submit_btn = gr.Button("Submit")
output_table = gr.DataFrame(label="Classification Results", visible=False)
submit_btn.click(fn=classify_text, inputs=input_text, outputs=output_table)
if __name__ == "__main__":
iface.launch()