Spaces:

gabrielchua
/

refactored-guacamole

Sleeping

App Files Files Community

refactored-guacamole / app.py

gabrielchua

Update app.py

c657583 verified 13 days ago

raw

history blame contribute delete

6.21 kB

	import os
	import gradio as gr
	import joblib
	import numpy as np
	import pandas as pd
	from openai import OpenAI
	from typing import List, Dict, Any

	# --- New Inference Code Components ---

	# Maps each top-level category to the expanded label names reported for it.
	# Two-entry lists hold [level-1 label, level-2 label]; run_model reads them by
	# index for every category it does not treat as single-level.
	CATEGORIES = {
	    "hateful": [
	        "hateful_lvl_1_discriminatory",
	        "hateful_lvl_2_hate_speech",
	    ],
	    "insults": ["insults"],
	    "sexual": [
	        "sexual_lvl_1_not_appropriate_for_minors",
	        "sexual_lvl_2_not_appropriate_for_all",
	    ],
	    "physical_violence": ["physical_violence"],
	    "self_harm": [
	        "self_harm_lvl_1_intent",
	        "self_harm_lvl_2_action",
	    ],
	    "all_other_misconduct": [
	        "all_other_misconduct_lvl_1_not_socially_accepted",
	        "all_other_misconduct_lvl_2_illegal",
	    ],
	}

	def get_embeddings(texts: List[str], model: str = "text-embedding-3-large") -> np.ndarray:
	    """
	    Embed a batch of texts with one synchronous OpenAI embeddings request.

	    Args:
	        texts: Strings to embed. Each one is cut to its first 8191 characters
	            before being sent.
	        model: Name of the OpenAI embedding model to query.

	    Returns:
	        A numpy array built from the embedding vectors in the response,
	        one row per item returned by the API.
	    """
	    # The key is read from the environment on every call; if the variable is
	    # unset, None is handed to the client constructor.
	    openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

	    # 8191 is the embedding model's token limit, but it is applied here as a
	    # character cap by plain string slicing.
	    char_cap = 8191
	    clipped = []
	    for text in texts:
	        clipped.append(text[:char_cap])

	    result = openai_client.embeddings.create(input=clipped, model=model)

	    vectors = [item.embedding for item in result.data]
	    return np.array(vectors)

	# Deserialized model bundles, keyed by path: path -> (file mtime at load time, bundle).
	_MODEL_CACHE = {}


	def _load_model_bundle(model_file):
	    """Load model_file with joblib, reusing the cached copy while the file's mtime is unchanged."""
	    if not isinstance(model_file, (str, os.PathLike)):
	        # Non-path inputs (e.g. an open file object) cannot be keyed by path,
	        # so they are loaded on every call exactly as before.
	        print("Loading model...")
	        return joblib.load(model_file)

	    cache_key = os.fspath(model_file)
	    modified_at = os.path.getmtime(cache_key)
	    cached = _MODEL_CACHE.get(cache_key)
	    if cached is not None and cached[0] == modified_at:
	        return cached[1]

	    print("Loading model...")
	    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
	    # code while doing so; only point this at model files from a trusted source.
	    bundle = joblib.load(model_file)
	    _MODEL_CACHE[cache_key] = (modified_at, bundle)
	    return bundle


	def run_model(model_file: str, embeddings: np.ndarray):
	    """
	    Run the stored classifier on the embeddings and expand its output to sub-level labels.

	    The model file is expected to hold a dict with a 'model' entry and a
	    'label_names' entry. The result of model.predict is iterated as one array
	    per category, whose column 0 is read as P(y>0) and column 1 as P(y>1);
	    label_names is indexed in that same category order.

	    The deserialized bundle is cached per path and reloaded only when the
	    file's modification time changes, so repeated calls do not re-read the
	    file from disk.
	    NOTE(review): the cached model object is therefore shared between calls;
	    confirm predict() is safe to call concurrently if requests run in parallel.

	    Args:
	        model_file: Path to the model file.
	        embeddings: Numpy array of embeddings, passed unchanged to model.predict.

	    Returns:
	        expanded_predictions, expanded_probabilities, expanded_label_names:
	        two arrays of shape (n_samples, n_expanded_labels) holding the binary
	        predictions and their scores, and the list of expanded label names in
	        column order.

	    Raises:
	        KeyError: If a label is not one of the single-level labels and has no
	            entry in CATEGORIES, or the bundle lacks 'model' / 'label_names'.
	    """
	    model_data = _load_model_bundle(model_file)
	    model = model_data['model']
	    label_names = model_data['label_names']

	    print("Predicting...")
	    # raw_predictions is a list of arrays with shape (n_samples, 2)
	    raw_predictions = model.predict(embeddings)

	    print("Processing predictions...")
	    predictions = []
	    probabilities = []
	    for pred in raw_predictions:
	        p_gt0 = pred[:, 0]  # P(y > 0)
	        p_gt1 = pred[:, 1]  # P(y > 1)

	        # The class in {0, 1, 2} is the number of thresholds whose probability exceeds 0.5.
	        pred_class = np.zeros(len(pred))
	        pred_class += (p_gt0 > 0.5).astype(int)
	        pred_class += (p_gt1 > 0.5).astype(int)
	        predictions.append(pred_class)

	        # P(y=0) = 1 - P(y>0), P(y=1) = P(y>0) - P(y>1), P(y=2) = P(y>1)
	        prob = np.zeros((len(pred), 3))
	        prob[:, 0] = 1 - p_gt0
	        prob[:, 1] = p_gt0 - p_gt1
	        prob[:, 2] = p_gt1
	        probabilities.append(prob)

	    # Put samples on the first axis: (n_samples, n_categories[, 3]).
	    predictions = np.array(predictions).T
	    probabilities = np.array(probabilities).transpose(1, 0, 2)

	    # Labels reported as a single binary column; every other label gets a
	    # level-1 and a level-2 column named from CATEGORIES.
	    single_level_labels = ('binary', 'insults', 'physical_violence')

	    expanded_predictions = []
	    expanded_probabilities = []
	    expanded_label_names = []
	    for i, cat in enumerate(label_names):
	        has_sub_levels = cat not in single_level_labels

	        # Level 1: class is 1 or 2; score is 1 - P(class 0).
	        expanded_predictions.append((predictions[:, i] > 0).astype(int))
	        expanded_probabilities.append(1 - probabilities[:, i, 0])
	        expanded_label_names.append(CATEGORIES[cat][0] if has_sub_levels else cat)

	        if has_sub_levels:
	            # Level 2: class is exactly 2; score is P(class 2).
	            expanded_predictions.append((predictions[:, i] == 2).astype(int))
	            expanded_probabilities.append(probabilities[:, i, 2])
	            expanded_label_names.append(CATEGORIES[cat][1])

	    expanded_predictions = np.array(expanded_predictions).T
	    expanded_probabilities = np.array(expanded_probabilities).T

	    return expanded_predictions, expanded_probabilities, expanded_label_names

	def format_output(predictions: np.ndarray, probabilities: np.ndarray, label_names: List[str]) -> pd.DataFrame:
	    """
	    Build the results table for the first sample of a prediction batch.

	    Args:
	        predictions: Binary predictions; only row 0 is used.
	        probabilities: Scores matching the predictions; only row 0 is used,
	            rounded to 4 decimal places.
	        label_names: List of label names, one per column of the two arrays.

	    Returns:
	        DataFrame with columns "Label", "Prediction", and "Score".
	    """
	    # The Gradio handler submits one text per call, so row 0 is the only sample.
	    first_row_flags = predictions[0].tolist()
	    first_row_scores = np.round(probabilities[0], 4).tolist()

	    column_names = ("Label", "Prediction", "Score")
	    column_values = (label_names, first_row_flags, first_row_scores)
	    return pd.DataFrame(dict(zip(column_names, column_values)))

	# --- Gradio App Integration ---

	# Path of the joblib model file that classify_text passes to run_model.
	# It is a relative path; adjust as necessary for where the model is deployed.
	MODEL_FILE = "model.joblib"

	def classify_text(text: str):
	    """
	    Classify one input text and return a Gradio update that shows the results table.

	    The text is embedded with get_embeddings, scored with run_model using
	    MODEL_FILE, and formatted with format_output.

	    Args:
	        text: The text to classify. None, an empty string, or a
	            whitespace-only string is treated as "no input".

	    Returns:
	        A gr.update that makes the output component visible and sets its value
	        to the results DataFrame, or to an empty DataFrame with the same
	        columns when no input was given.
	    """
	    # Check for None first: calling .strip() on it would raise AttributeError.
	    if not text or not text.strip():
	        # Return an empty DataFrame if no text provided
	        empty_df = pd.DataFrame({"Label": [], "Prediction": [], "Score": []})
	        return gr.update(value=empty_df, visible=True)

	    # Obtain embeddings (input must be a list)
	    embeddings = get_embeddings([text])

	    # Run inference on the embeddings using the configured model file
	    predictions, probabilities, label_names = run_model(MODEL_FILE, embeddings)

	    # Format the results to a DataFrame that Gradio can display
	    df = format_output(predictions, probabilities, label_names)
	    return gr.update(value=df, visible=True)

	# Build the page: a text box, a submit button, and a results table that is
	# created hidden and made visible by the update classify_text returns.
	with gr.Blocks(title="Zoo Entry 001 - Updated Inference") as iface:
	    input_text = gr.Textbox(lines=5, label="Input Text")
	    submit_btn = gr.Button("Submit")
	    output_table = gr.DataFrame(label="Classification Results", visible=False)

	    # Clicking the button sends the text box content to classify_text and
	    # writes the result into the table.
	    submit_btn.click(
	        fn=classify_text,
	        inputs=input_text,
	        outputs=output_table,
	    )

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()