|
|
|
|
|
|
|
|
|
import gradio as gr |
|
import os |
|
import time |
|
import openai |
|
import numpy as np |
|
import pandas as pd |
|
import spacy |
|
import en_core_web_sm |
|
import plotly.express as px |
|
|
|
# OpenAI credentials are read from the environment; a missing variable
# yields None rather than raising here.
openai.organization = os.getenv('ORGANIZATION')
openai.api_key = os.getenv('API_KEY')

# Shared spaCy pipeline used for sentence splitting.
nlp = spacy.load("en_core_web_sm")
|
|
|
|
|
|
|
|
|
|
|
def text_to_sentences(text):
    """Split *text* into sentences with the module-level spaCy pipeline.

    Returns a list holding the text of each sentence span found by
    ``nlp``, in document order.
    """
    parsed = nlp(text)
    return [span.text for span in parsed.sents]
|
|
|
def calculate_embeddings_with_gpt3(text, engine="text-similarity-davinci-001", interval=1.5, verbose=True):
    """Fetch an embedding for *text* via ``openai.Embedding.create``.

    Args:
        text: Value sent as ``input`` to the embeddings call.
        engine: Engine name forwarded to the API.
        interval: Seconds to sleep before the request is issued.
        verbose: When true, print a progress line first.

    Returns:
        The ``embedding`` entry of the first item in the response's
        ``data`` list.
    """
    if verbose:
        print(f'Calculating embedding for {text}...')

    # Pause before every request (presumably to stay under the API rate
    # limit). NOTE(review): assumed unconditional, i.e. not tied to
    # ``verbose`` -- confirm against the intended behaviour.
    time.sleep(interval)

    first_item = openai.Embedding.create(input=text, engine=engine)['data'][0]
    return first_item['embedding']
|
|
|
def gpt3_zero_shot_classification(text, labels):
    """Score each sentence of *text* against the reference phrases, per tag.

    Every sentence is embedded with ``calculate_embeddings_with_gpt3`` and
    compared by cosine similarity with each embedding in the module-level
    ``df_phrases`` frame (columns read here: ``embedding``, ``example``,
    ``category`` and ``label``).
    NOTE(review): ``df_phrases`` is not defined in the visible part of this
    file -- confirm it is loaded before this function runs.

    Args:
        text: Free text to analyse; split with ``text_to_sentences``.
        labels: Similarity threshold as a percentage. The
            parameter name is kept for backward compatibility; the UI binds
            its 0-100 slider to this argument.

    Returns:
        A ``(res, fig, details)`` tuple. ``res`` maps each tag to its highest
        similarity, ``fig`` is a Plotly bar chart of those maxima with the
        threshold drawn as a dotted line, and ``details`` is a DataFrame with
        the three most similar rows for each tag.

    Raises:
        TypeError, ValueError: If ``labels`` cannot be converted to a float.
    """
    # The original divided an unassigned local ``threshold`` by 100, which
    # raises UnboundLocalError; the percentage arrives in ``labels``.
    threshold = float(labels) / 100

    # Reference embeddings, one row per phrase, read positionally so they
    # line up with the metadata tuples below.
    targets = np.array(df_phrases["embedding"].tolist())
    target_norms = np.linalg.norm(targets, axis=1)  # loop-invariant
    phrase_meta = list(
        zip(df_phrases["example"], df_phrases["category"], df_phrases["label"])
    )

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copies the whole frame on every call.
    rows = []
    for line, sentence in enumerate(text_to_sentences(text), start=1):
        source = np.array(calculate_embeddings_with_gpt3(sentence))
        cosines = np.dot(targets, source) / (target_norms * np.linalg.norm(source))
        # Covers every phrase in df_phrases instead of a hard-coded 64.
        rows.extend(
            {
                'line': line,
                'sentence': sentence,
                'phrase': example,
                'category': category,
                'tag': tag,
                'similarity': float(similarity),
            }
            for (example, category, tag), similarity in zip(phrase_meta, cosines)
        )

    df_results = pd.DataFrame(
        rows,
        columns=['line', 'sentence', 'phrase', 'category', 'tag', 'similarity'],
    )

    # Best similarity per tag, flagged against the threshold.
    df_summary = df_results.groupby('tag')['similarity'].max().to_frame()
    df_summary['ok'] = df_summary['similarity'] > threshold

    fig = px.bar(
        df_summary,
        y='similarity',
        color='ok',
        color_discrete_map={
            True: px.colors.qualitative.Plotly[2],
            False: px.colors.qualitative.Set2[7],
        },
        text='similarity',
        text_auto='.3f',
        labels={'tag': 'Category', 'similarity': 'Similarity'},
        title=f"{text[:200]}...",
    )
    # Horizontal dotted line across the full plot width at the threshold.
    fig.add_shape(
        type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
        x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y",
    )
    fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
    fig.update_yaxes(range=[0, 1])

    # Top three matches for each tag, most similar first.
    details = (
        df_results.drop(columns='line')
        .sort_values(['tag', 'similarity'], ascending=[True, False])
        .groupby('tag')
        .head(3)
        .reset_index(drop=True)
    )

    res = df_summary['similarity'].to_dict()

    return res, fig, details
|
|
|
|
|
|
|
# Gradio UI: a context textbox and a threshold slider feed
# gpt3_zero_shot_classification; its three results go to the label, plot and
# dataframe components.
with gr.Blocks(css=".gradio-container { background-color: white; }") as demo:
    gr.Markdown("# GPT-3 Zero shot classification app")
    with gr.Row():
        context = gr.Textbox(lines=3, label="Context", placeholder="Context Here...")
    with gr.Row():
        # Similarity threshold in percent: range 0-100, initial value 80.
        threshold = gr.Slider(0, 100, 80)
        btn = gr.Button(value="Analyze!", variant="primary")
    with gr.Row():
        label = gr.Label()
        plot = gr.Plot()
    with gr.Row():
        grid = gr.Dataframe(wrap=True)
    btn.click(
        fn=gpt3_zero_shot_classification,
        inputs=[context, threshold],
        outputs=[label, plot, grid],
    )
    # Each example row supplies one value per input component, in order
    # (context, threshold). The second value must be numeric because it is
    # bound to the slider; the previous rows passed a label string there.
    gr.Examples(
        [
            ["", 80],
            ["", 80],
            ["", 80],
            ["", 80],
        ],
        [context, threshold],
        fn=gpt3_zero_shot_classification,
    )

demo.launch(debug=True)