Spaces:
Sleeping
Sleeping
File size: 2,793 Bytes
d727a16 4b1c6eb d727a16 094d202 376e087 2430162 376e087 00e69cc 2430162 4b945ef 376e087 4b945ef 2430162 376e087 581cd9f 00e69cc 376e087 4b1c6eb 4b945ef 4b1c6eb 376e087 3dfdc35 5b6d5fb d952d3e ba3e19e 3dfdc35 ba3e19e 3dfdc35 e7c7d84 a3ce6a9 e7c7d84 3152fdc e7c7d84 90a444c 94f8c57 90a444c 94f8c57 e7c7d84 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import streamlit as st
import pandas as pd
# --- Page header -------------------------------------------------------------
st.markdown('# Semantic search and topic classification (v1)')
st.markdown(' - Author: hcontreras')
st.markdown(' - Description: We want to classify sentences into a predefined set of topics. We use semantic search with a pre-trained transformer and we embed the input sentences and find the score relative to each topic')

# --- Quick test: score one free-text sentence against one selected topic ------
st.markdown('## A quick test')
# NOTE(review): the blurb below still mentions a movie title and a slider;
# the current UI has a sentence box and a topic selector instead. Left
# unchanged because it is user-facing copy -- confirm the wording with the author.
st.markdown('As a test we can create an embedding for a movie title and explore each component with the slider. Have fun!')

# `util` has to be imported here: this section calls util.cos_sim
# unconditionally, while the only other import of `util` in the file sits
# inside the "both files uploaded" branch further down.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

input_sentence = st.text_input('Sentence', 'This is a test for a news article')
input_topic = st.selectbox(
    'Topic',
    ('Space', 'Transportation', 'Health'))

# Embed the sentence and the chosen topic with the same model so that their
# vectors are comparable.
embedding_sentence = model.encode(input_sentence)
embedding_input_topic = model.encode(input_topic)

# Indexing with [0] takes the first row of the similarity matrix returned by
# util.cos_sim; .item() then unwraps the single value into a plain Python
# float so the page shows a number rather than a tensor repr.
cos_scores = util.cos_sim(embedding_input_topic, embedding_sentence)[0]
st.write('Score for topic', input_topic, ':', cos_scores.item())

st.markdown('## ')
# Sentence list upload. The scoring step further down reads the 'sentence'
# column from this frame.
uploaded_file1 = st.file_uploader("Choose a file: sentence list")
if uploaded_file1 is not None:
    df1 = pd.read_csv(uploaded_file1)
    sentence_preview = df1.head()
    st.write(sentence_preview)

# Topic list upload. The scoring step further down reads the 'topic' column
# from this frame.
uploaded_file2 = st.file_uploader("Choose a file: topic list")
if uploaded_file2 is not None:
    df2 = pd.read_csv(uploaded_file2)
    topic_preview = df2.head()
    st.write(topic_preview)
# Batch scoring: once both CSVs are uploaded, add one column per topic to df1
# holding the cosine similarity between that topic and every sentence.
if uploaded_file1 is not None and uploaded_file2 is not None:
    from sentence_transformers import SentenceTransformer, util
    import torch

    embedder = SentenceTransformer('all-MiniLM-L6-v2')

    # Hand the encoder a plain list of strings instead of a pandas Series, so
    # it does not depend on the Series index or on non-string cell values.
    corpus = df1['sentence'].astype(str).tolist()
    topics = df2['topic']

    # The sentences are embedded once and reused for every topic.
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

    for topic in topics:
        # The same string form is used for the embedding and the column name.
        topic_name = str(topic)
        topic_embedding = embedder.encode(topic_name, convert_to_tensor=True)
        cos_scores = util.cos_sim(topic_embedding, corpus_embeddings)[0]
        # Store plain Python floats: assigning the tensor itself can leave
        # tensor objects in the cells (or fail for a tensor on a GPU),
        # depending on the pandas version and device.
        df1[topic_name] = cos_scores.cpu().tolist()

    st.write(df1)
# NOTE(review): newer Streamlit releases deprecate st.cache in favour of
# st.cache_data -- confirm against the Streamlit version this app pins.
@st.cache
def convert_df_to_csv(df):
    """Serialise *df* to CSV text and return it as UTF-8 encoded bytes.

    The result is cached so the conversion is not recomputed on every
    rerun of the script.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
# Offer the scored table for download. df1 only exists after the sentence
# file is uploaded, and only carries the score columns after the topic file
# is uploaded too, so the button uses the same condition as the scoring step.
if uploaded_file1 is not None and uploaded_file2 is not None:
    st.download_button(
        label="Download data as CSV",
        data=convert_df_to_csv(df1),
        file_name='output.csv',
        mime='text/csv',
    )
|