# https://huggingface.co/tasks/token-classification
# https://huggingface.co/spacy/en_core_web_sm
# pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
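# Zero-shot classification via embeddings: each sentence of the input text is embedded
# with GPT-3 and compared (cosine similarity) against a bank of labelled example phrases;
# the best match per tag is plotted against a user-chosen threshold.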
import gradio as gr
import os
import time
import openai
import numpy as np
import pandas as pd
import spacy
import en_core_web_sm
import plotly.express as px
openai.organization = os.environ.get('ORGANIZATION')
openai.api_key = os.environ.get('API_KEY')
nlp = spacy.load("en_core_web_sm")
# Classify the following text into one of these categories: Entertainment, Business, Politics
# This dull recreation of the animated film doesn’t strive for anything more than what was contained in the original version of this film and actually delivers less.
# Category: Entertainment
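# Split a text into sentences with spaCy's sentence segmenter.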
def text_to_sentences(text):
    doc = nlp(text)
    sentences = [sentence.text for sentence in doc.sents]
    # print(sentences[:3])
    return sentences
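# Fetch a GPT-3 embedding for a piece of text from the OpenAI Embeddings API,
# sleeping `interval` seconds before each call as a crude rate limit.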
def calculate_embeddings_with_gpt3(text, engine="text-similarity-davinci-001", interval=1.5, verbose=True):
    if verbose:
        print(f'Calculating embedding for {text}...')
    time.sleep(interval)
    response = openai.Embedding.create(
        input=text,
        engine=engine
    )
    embedding = response['data'][0]['embedding']
    return embedding
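# Main pipeline: embed every sentence of the input, score it against the phrase bank
# (df_phrases) with cosine similarity, and return the best similarity per tag (dict),
# a bar chart of those scores against the threshold, and a details table of top matches.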
def gpt3_zero_shot_classification(text, threshold):
    df_sentences = pd.DataFrame(columns=['line', 'sentence', 'embedding'])
    for idx, sentence in enumerate(text_to_sentences(text)):
        embedding = calculate_embeddings_with_gpt3(sentence)
        # Create new row with the sentence and its embedding
        new_row = {
            'line': idx + 1,
            'sentence': sentence,
            'embedding': embedding
        }
        df_sentences = pd.concat([df_sentences, pd.DataFrame([new_row])], ignore_index=True)
    # print(df_sentences.shape)
    # df_sentences.head()
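    # df_phrases is assumed to be defined elsewhere in the Space (e.g. loaded from a data
    # file): one row per labelled example phrase with columns 'example', 'category',
    # 'label' and a precomputed 'embedding', indexed 0..len(df_phrases)-1 as used below.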
    targets = np.array([np.array(value[0]) for value in df_phrases[["embedding"]].values])
    # print(f"targets:{targets.shape}")
    df_cosines = pd.DataFrame(columns=['line'])
    for i, row in df_sentences.iterrows():
        line = f'{row["line"]:03}'
        # print(f'Calculating cosines for [ {line} ] {row["sentence"][:50]}...')
        source = np.array(row["embedding"])
        # Cosine similarity between this sentence and every phrase embedding
        cosine = np.dot(targets, source) / (np.linalg.norm(targets, axis=1) * np.linalg.norm(source))
        # One column per phrase: Cosine01, Cosine02, ...
        new_row = {f"Cosine{key:02}": value for key, value in enumerate(cosine.flatten(), 1)}
        new_row["line"] = row["line"]
        df_cosines = pd.concat([df_cosines, pd.DataFrame([new_row])], ignore_index=True)
    df_cosines['line'] = df_cosines['line'].astype('int')
    # print(df_cosines.shape)
    # df_cosines.head(3)
    df_comparison = df_cosines  # [(df_cosines.filter(regex='Cosine') > threshold).any(axis=1)]
    # print(df_comparison.shape)
    # df_comparison.head(3)
    # Slider value is 0-100; cosine similarities are 0-1
    threshold = threshold / 100
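    # Cross every sentence with every phrase in the bank, recording the similarity
    # together with the phrase's text, category and tag.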
    df_results = pd.DataFrame(columns=['line', 'sentence', 'phrase', 'category', 'tag', 'similarity'])
    for i, row in df_comparison.iterrows():
        for n in range(1, len(df_phrases) + 1):
            col = f"Cosine{n:02}"
            # if row[col] > threshold:
            phrase = df_phrases.loc[[n - 1]]
            new_row = {
                'line': row["line"],
                'sentence': df_sentences.at[int(row["line"]) - 1, "sentence"],
                'phrase': df_phrases.at[n - 1, "example"],
                'category': df_phrases.at[n - 1, "category"],
                'tag': df_phrases.at[n - 1, "label"],
                'similarity': row[col]
            }
            df_results = pd.concat([df_results, pd.DataFrame([new_row])], ignore_index=True)
    df_results['line'] = df_results['line'].astype('int')
    # print(df_results.shape)
    # df_results.head(3)
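    # Best (maximum) similarity per tag; 'ok' flags whether it clears the threshold.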
    df_summary = df_results.groupby(['tag'])['similarity'].agg('max').to_frame()
    df_summary['ok'] = np.where(df_summary['similarity'] > threshold, True, False)
    # df_summary
    fig = px.bar(
        df_summary,
        y='similarity',
        color='ok',
        color_discrete_map={True: px.colors.qualitative.Plotly[2], False: px.colors.qualitative.Set2[7]},
        text='similarity',
        text_auto='.3f',
        labels={'tag': 'Category', 'similarity': 'Similarity'},
        title=f"{text[:200]}..."
    )
    fig.add_shape(  # add a horizontal "target" line at the threshold
        type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
        x0=0, x1=1, xref="paper", y0=threshold, y1=threshold, yref="y"
    )
    fig.update_traces(textfont_size=24, textangle=0, textposition="inside", cliponaxis=False)
    fig.update_yaxes(range=[0, 1])
    # fig.show()
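    # Details grid: top 3 matching phrases per tag, sorted by similarity.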
    details = (
        df_results
        .drop(labels='line', axis=1)
        .sort_values(['tag', 'similarity'], ascending=[True, False])
        .groupby('tag')
        .head(3)
        .reset_index(drop=True)
    )
    res = df_summary['similarity'].to_dict()
    return res, fig, details
# Gradio UI
with gr.Blocks(css=".gradio-container { background-color: white; }") as demo:
    gr.Markdown("# GPT-3 zero-shot classification app")
    with gr.Row():
        context = gr.Textbox(lines=3, label="Context", placeholder="Context Here...")
    with gr.Row():
        threshold = gr.Slider(0, 100, 80, label="Threshold (%)")
        btn = gr.Button(value="Analyze!", variant="primary")
    with gr.Row():
        label = gr.Label()
        plot = gr.Plot()
    with gr.Row():
        grid = gr.Dataframe(wrap=True)
    btn.click(fn=gpt3_zero_shot_classification, inputs=[context, threshold], outputs=[label, plot, grid])
    gr.Examples(
        [
            ["", "Entertainment, Business, Politics"],
            ["", "Entertainment, Business, Politics"],
            ["", "Entertainment, Business, Politics"],
            ["", "Entertainment, Business, Politics"]
        ],
        [context, threshold],
        fn=gpt3_zero_shot_classification
    )
demo.launch(debug=True)