# https://huggingface.co/tasks/token-classification
# https://huggingface.co/spacy/en_core_web_sm
# pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
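# Zero-shot classification via embeddings: each sentence of the input text is embedded
# with GPT-3 and compared (cosine similarity) against a bank of labelled example phrases;
# the best match per tag is plotted against a user-chosen threshold.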
import gradio as gr
import os
import time
import openai
import numpy as np
import pandas as pd
import spacy
import en_core_web_sm
import plotly.express as px
openai.organization = os.environ.get('ORGANIZATION')
openai.api_key = os.environ.get('API_KEY')
nlp = spacy.load("en_core_web_sm")
# Classify the following text into one of these categories: Entertainment, Business, Politics
# This dull recreation of the animated film doesn’t strive for anything more than what was contained in the original version of this film and actually delivers less.
# Category: Entertainment
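# Split a text into sentences with spaCy's sentence segmenter.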
def text_to_sentences(text):
    doc = nlp(text)
    sentences = [sentence.text for sentence in doc.sents]
    # print(sentences[:3])
    return sentences
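# Fetch a GPT-3 embedding for a piece of text from the OpenAI Embeddings API,
# sleeping `interval` seconds before each call as a crude rate limit.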
def calculate_embeddings_with_gpt3(text, engine="text-similarity-davinci-001", interval=1.5, verbose=True):
    if verbose:
        print(f'Calculating embedding for {text}...')
    time.sleep(interval)
    response = openai.Embedding.create(
        input=text,
        engine=engine
    )
    embedding = response['data'][0]['embedding']
    return embedding
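# Main pipeline: embed every sentence of the input, score it against the phrase bank
# (df_phrases) with cosine similarity, and return the best similarity per tag (dict),
# a bar chart of those scores against the threshold, and a details table of top matches.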
def gpt3_zero_shot_classification(text, threshold):
    df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
    for idx, sentence in enumerate(text_to_sentences(text)):
        embedding = calculate_embeddings_with_gpt3(sentence)
        # Create new row with the sentence and its embedding
        new_row = {
            'line': idx + 1,
            'sentence': sentence,
            'embedding': embedding
        }
        df_sentences = pd.concat([df_sentences, pd.DataFrame([new_row])], ignore_index=True)
    # print(df_sentences.shape)
    # df_sentences.head()
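    # df_phrases is assumed to be defined elsewhere in the Space (e.g. loaded from a data
    # file): one row per labelled example phrase with columns 'example', 'category',
    # 'label' and a precomputed 'embedding', indexed 0..len(df_phrases)-1 as used below.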
    targets = np.array([np.array(value[0]) for value in df_phrases[["embedding"]].values])
    # print(f"targets:{targets.shape}")
    df_cosines = pd.DataFrame(columns=['line'])
    for i, row in df_sentences.iterrows():
        line = f'{row["line"]:03}'
        # print(f'Calculating cosines for [ {line} ] {row["sentence"][:50]}...')
        source = np.array(row["embedding"])
        # Cosine similarity between this sentence and every phrase embedding
        cosine = np.dot(targets, source) / (np.linalg.norm(targets, axis=1) * np.linalg.norm(source))
        # One column per phrase: Cosine01, Cosine02, ...
        new_row = {f"Cosine{key:02}": value for key, value in enumerate(cosine.flatten(), 1)}
        new_row["line"] = row["line"]
        df_cosines = pd.concat([df_cosines, pd.DataFrame([new_row])], ignore_index=True)
    df_cosines['line'] = df_cosines['line'].astype('int')
    # print(df_cosines.shape)
    # df_cosines.head(3)
    df_comparison = df_cosines  # [(df_cosines.filter(regex='Cosine') > threshold).any(axis=1)]
    # print(df_comparison.shape)
    # df_comparison.head(3)
    # Slider value is 0-100; cosine similarities are 0-1
    threshold = threshold / 100
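    # Cross every sentence with every phrase in the bank, recording the similarity
    # together with the phrase's text, category and tag.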
    df_results = pd.DataFrame(columns=['line', 'sentence', 'phrase', 'category', 'tag', 'similarity'])
    for i, row in df_comparison.iterrows():
        for n in range(1, len(df_phrases) + 1):
            col = f"Cosine{n:02}"
            # if row[col] > threshold:
            phrase = df_phrases.loc[[n - 1]]
            new_row = {
                'line': row["line"],
                'sentence': df_sentences.at[int(row["line"]) - 1, "sentence"],
                'phrase': df_phrases.at[n - 1, "example"],
                'category': df_phrases.at[n - 1, "category"],
                'tag': df_phrases.at[n - 1, "label"],
                'similarity': row[col]
            }
            df_results = pd.concat([df_results, pd.DataFrame([new_row])], ignore_index=True)
    df_results['line'] = df_results['line'].astype('int')
    # print(df_results.shape)
    # df_results.head(3)
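    # Best (maximum) similarity per tag; 'ok' flags whether it clears the threshold.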
    df_summary = df_results.groupby(['tag'])['similarity'].agg('max').to_frame()
    df_summary['ok'] = np.where(df_summary['similarity'] > threshold, True, False)
    # df_summary
    fig = px.bar(
        df_summary,
        y='similarity',
        color='ok',
        color_discrete_map={True: px.colors.qualitative.Plotly[2], False: px.colors.qualitative.Set2[7]},
        text='similarity',
        text_auto='.3f',
        labels={'tag': 'Category', 'similarity': 'Similarity'},
        title=f"{text[:200]}..."
    )
    fig.add_shape(  # add a horizontal "target" line at the threshold
        type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
        x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y"
    )
    fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
    fig.update_yaxes(range=[0, 1])
    # fig.show()
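    # Details grid: top 3 matching phrases per tag, sorted by similarity.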
    details = (
        df_results
        .drop(labels='line', axis=1)
        .sort_values(['tag', 'similarity'], ascending=[True, False])
        .groupby('tag')
        .head(3)
        .reset_index(drop=True)
    )
    res = df_summary['similarity'].to_dict()
    return res, fig, details
# Gradio UI
with gr.Blocks(css=".gradio-container { background-color: white; }") as demo:
    gr.Markdown("# GPT-3 zero-shot classification app")
    with gr.Row():
        context = gr.Textbox(lines=3, label="Context", placeholder="Context Here...")
    with gr.Row():
        threshold = gr.Slider(0, 100, 80, label="Threshold (%)")
        btn = gr.Button(value="Analyze!", variant="primary")
    with gr.Row():
        label = gr.Label()
        plot = gr.Plot()
    with gr.Row():
        grid = gr.Dataframe(wrap=True)
    btn.click(fn=gpt3_zero_shot_classification, inputs=[context, threshold], outputs=[label, plot, grid])
    gr.Examples(
        [
            ["", "Entertainment, Business, Politics"],
            ["", "Entertainment, Business, Politics"],
            ["", "Entertainment, Business, Politics"],
            ["", "Entertainment, Business, Politics"]
        ],
        [context, threshold],
        fn=gpt3_zero_shot_classification
    )
demo.launch(debug=True)