import os
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from openai import OpenAI
from typing import List, Dict, Any

# --- New Inference Code Components ---

# Define categories with sub-level information
CATEGORIES = {
    'hateful': ['hateful_lvl_1_discriminatory', 'hateful_lvl_2_hate_speech'],
    'insults': ['insults'],
    'sexual': ['sexual_lvl_1_not_appropriate_for_minors', 'sexual_lvl_2_not_appropriate_for_all'],
    'physical_violence': ['physical_violence'],
    'self_harm': ['self_harm_lvl_1_intent', 'self_harm_lvl_2_action'],
    'all_other_misconduct': ['all_other_misconduct_lvl_1_not_socially_accepted', 'all_other_misconduct_lvl_2_illegal']
}
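
# Ordinal scheme: each multi-level category is scored 0/1/2; its level-1
# label fires for scores >= 1 and its level-2 label only for score 2 (see
# run_model below). Single-entry categories are plain binary labels.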

def get_embeddings(texts: List[str], model: str = "text-embedding-3-large") -> np.ndarray:
    """
    Generate embeddings for a list of texts using the OpenAI API synchronously.
    
    Args:
        texts: List of strings to embed.
        model: The OpenAI embedding model to use.
    
    Returns:
        A numpy array of embeddings.
    """
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    MAX_TOKENS = 8191  # Input limit of the embedding model, in tokens
    # Character-based slicing is only a rough stand-in for token truncation:
    # typical English text comes in far below the limit this way, though
    # token-dense Unicode could still exceed it.
    truncated_texts = [text[:MAX_TOKENS] for text in texts]
    
    response = client.embeddings.create(
        input=truncated_texts,
        model=model
    )
    
    embeddings = np.array([data.embedding for data in response.data])
    return embeddings
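
# A token-accurate alternative to the character slice above, sketched here
# assuming the optional tiktoken package (text-embedding-3-* models use the
# cl100k_base encoding); this helper is illustrative, not part of the app:
def truncate_to_tokens(text: str, max_tokens: int = 8191) -> str:
    """Truncate text to at most max_tokens tokens (requires tiktoken)."""
    import tiktoken
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    return enc.decode(tokens[:max_tokens])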

def run_model(model_file: str, embeddings: np.ndarray):
    """
    Run the model on the embeddings.
    
    Args:
        model_file: Path to the model file.
        embeddings: Numpy array of embeddings.
    
    Returns:
        A tuple (expanded_predictions, expanded_probabilities,
        expanded_label_names); the first two are arrays of shape
        (n_samples, n_labels).
    """
    print("Loading model...")
    model_data = joblib.load(model_file)
    model = model_data['model']
    label_names = model_data['label_names']
    
    print("Predicting...")
    # raw_predictions is a list of arrays with shape (n_samples, 2)
    raw_predictions = model.predict(embeddings)
    
    print("Processing predictions...")
    predictions = []
    probabilities = []
    # Process each category's raw predictions
    for i, pred in enumerate(raw_predictions):
        # Convert raw predictions (P(y>0), P(y>1)) into a class from {0, 1, 2}
        pred_class = np.zeros(len(pred))
        pred_class += (pred[:, 0] > 0.5).astype(int)  # y > 0
        pred_class += (pred[:, 1] > 0.5).astype(int)  # y > 1
        predictions.append(pred_class)
        
        # Calculate probabilities for each class:
        # P(y=0) = 1 - P(y>0), P(y=1) = P(y>0) - P(y>1), P(y=2) = P(y>1)
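        # Worked example (hypothetical values): a row (0.9, 0.3) decodes to
        # class 0 + 1 + 0 = 1 with probabilities (0.1, 0.6, 0.3).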
        prob = np.zeros((len(pred), 3))
        prob[:, 0] = 1 - pred[:, 0]
        prob[:, 1] = pred[:, 0] - pred[:, 1]
        prob[:, 2] = pred[:, 1]
        probabilities.append(prob)
    
    predictions = np.array(predictions).T
    probabilities = np.array(probabilities).transpose(1, 0, 2)
    
    # Expand predictions to sub-levels
    expanded_predictions = []
    expanded_probabilities = []
    expanded_label_names = []
    for i, cat in enumerate(label_names):
        # Level 1 binary 
        y_pred_l1 = (predictions[:, i] > 0).astype(int) # y == 1 or y == 2
        y_proba_l1 = 1 - probabilities[:, i, 0]  # 1 - P(class 0)
        
        # Level 2 binary 
        y_pred_l2 = (predictions[:, i] == 2).astype(int) # only y == 2
        y_proba_l2 = probabilities[:, i, 2]  # Probability of class 2
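        # Worked example (hypothetical): ordinal class 2 for 'hateful' fires
        # both 'hateful_lvl_1_discriminatory' (y_pred_l1 = 1) and
        # 'hateful_lvl_2_hate_speech' (y_pred_l2 = 1).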

        # Single-level categories keep the plain category name as their label;
        # multi-level categories expand into their two sub-level labels.
        if cat in ['binary', 'insults', 'physical_violence']:
            expanded_predictions.append(y_pred_l1)
            expanded_probabilities.append(y_proba_l1)
            expanded_label_names.append(cat)
        else:
            expanded_predictions.append(y_pred_l1)
            expanded_probabilities.append(y_proba_l1)
            expanded_label_names.append(CATEGORIES[cat][0])
            
            expanded_predictions.append(y_pred_l2)
            expanded_probabilities.append(y_proba_l2)
            expanded_label_names.append(CATEGORIES[cat][1])
    
    expanded_predictions = np.array(expanded_predictions).T
    expanded_probabilities = np.array(expanded_probabilities).T
    
    return expanded_predictions, expanded_probabilities, expanded_label_names

def format_output(predictions: np.ndarray, probabilities: np.ndarray, label_names: List[str]) -> pd.DataFrame:
    """
    Format the output predictions into a DataFrame.
    
    Args:
        predictions: Binary predictions.
        probabilities: Associated prediction scores.
        label_names: List of label names.
    
    Returns:
        DataFrame with columns "Label", "Prediction", and "Score".
    """
    # As our Gradio interface processes one text at a time, we use the first (and only) sample.
    data = {
        "Label": label_names,
        "Prediction": predictions[0].tolist(),
        "Score": np.round(probabilities[0], 4).tolist()
    }
    return pd.DataFrame(data)
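
# Example of the returned table for a benign input (hypothetical scores):
#
#                           Label  Prediction   Score
# 0  hateful_lvl_1_discriminatory           0  0.0312
# 1     hateful_lvl_2_hate_speech           0  0.0074
# 2                       insults           0  0.0519
# ...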

# --- Gradio App Integration ---

# Define model file path (adjust as necessary)
MODEL_FILE = "model.joblib"
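
# Performance note: run_model() reloads model.joblib on every request. A
# minimal sketch of loading it once (assuming the same joblib bundle layout,
# with 'model' and 'label_names' keys) could look like this:
from functools import lru_cache

@lru_cache(maxsize=1)
def load_model_data(model_file: str = MODEL_FILE) -> Dict[str, Any]:
    """Load and cache the model bundle so repeated requests skip disk I/O."""
    return joblib.load(model_file)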

def classify_text(text: str):
    """
    Generate embeddings for the input text, run model inference, and
    return a DataFrame of classification results.
    """
    if not text.strip():
        # Return an empty DataFrame if no text provided
        empty_df = pd.DataFrame({"Label": [], "Prediction": [], "Score": []})
        return gr.update(value=empty_df, visible=True)
    
    # Obtain embeddings (input must be a list)
    embeddings = get_embeddings([text])
    
    # Run inference on the embeddings using the new model file
    predictions, probabilities, label_names = run_model(MODEL_FILE, embeddings)
    
    # Format the results to a DataFrame that Gradio can display
    df = format_output(predictions, probabilities, label_names)
    return gr.update(value=df, visible=True)

with gr.Blocks(title="Zoo Entry 001 - Updated Inference") as iface:
    input_text = gr.Textbox(lines=5, label="Input Text")
    submit_btn = gr.Button("Submit")
    output_table = gr.DataFrame(label="Classification Results", visible=False)
    
    submit_btn.click(fn=classify_text, inputs=input_text, outputs=output_table)

if __name__ == "__main__":
    iface.launch()