Spaces:

hacpdsae2023
/

test

Sleeping

File size: 2,793 Bytes

d727a16
4b1c6eb
d727a16
094d202
 
 
 
376e087
 
 
2430162
 
 
376e087
 
 
 
00e69cc
 
2430162
 
 
 
 
4b945ef
376e087
4b945ef
2430162
376e087
581cd9f
00e69cc
376e087
4b1c6eb
4b945ef
 
 
4b1c6eb
376e087
3dfdc35
 
5b6d5fb
d952d3e
ba3e19e
3dfdc35
 
 
 
ba3e19e
 
3dfdc35
e7c7d84
 
 
 
 
 
a3ce6a9
e7c7d84
 
 
 
3152fdc
e7c7d84
 
 
 
90a444c
 
 
 
 
 
94f8c57
90a444c
 
 
 
94f8c57
 
e7c7d84

import streamlit as st
import pandas as pd

# Static page header and introduction. Each entry is rendered as its own
# markdown element, in order: title, author/description bullets, then the
# heading and blurb of the quick-test section.
_INTRO_MARKDOWN = (
    '# Semantic search and topic classification (v1)',
    ' - Author: hcontreras',
    ' - Description: We want to classify sentences into a predefined set of topics. We use semantic search with a pre-trained transformer and we embed the input sentences and find the score relative to each topic',
    '## A quick test',
    'As a test we can create an embedding for a movie title and explore each component with the slider. Have fun!',
)
for _intro_block in _INTRO_MARKDOWN:
    st.markdown(_intro_block)

# `util` is required for cos_sim below; importing it here keeps the quick test
# runnable on its own, before any file has been uploaded.
from sentence_transformers import SentenceTransformer, util

# Pre-trained sentence-embedding model used by the interactive quick test.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Topics offered in the quick-test selector.
QUICK_TEST_TOPICS = ('Space', 'Transportation', 'Health')

input_sentence = st.text_input('Sentence', 'This is a test for a news article')
input_topic = st.selectbox('Topic', QUICK_TEST_TOPICS)

# Embed the sentence and the selected topic with the same model so that the
# two vectors can be compared directly.
embedding_sentence = model.encode(input_sentence)
embedding_input_topic = model.encode(input_topic)

# For a single pair cos_sim yields a 1x1 matrix; take its only row and unwrap
# it to a plain float so the page shows a number rather than a tensor repr.
cos_scores = util.cos_sim(embedding_input_topic, embedding_sentence)[0]
score = cos_scores.item()

st.write('Score for topic', input_topic, ':', score)

st.markdown('## ')


def _read_and_preview(uploaded_csv):
    """Parse an uploaded CSV, show its first rows on the page and return it."""
    frame = pd.read_csv(uploaded_csv)
    st.write(frame.head())
    return frame


# Sentence list: loaded into df1 only once the user has provided a file.
uploaded_file1 = st.file_uploader("Choose a file: sentence list")
if uploaded_file1 is not None:
    df1 = _read_and_preview(uploaded_file1)

# Topic list: loaded into df2 only once the user has provided a file.
uploaded_file2 = st.file_uploader("Choose a file: topic list")
if uploaded_file2 is not None:
    df2 = _read_and_preview(uploaded_file2)

# Batch mode: once both CSVs are uploaded, score every sentence against every
# topic, show the resulting table and offer it as a CSV download.
if uploaded_file1 is not None and uploaded_file2 is not None:
    from sentence_transformers import SentenceTransformer, util
    import torch

    if 'sentence' not in df1.columns or 'topic' not in df2.columns:
        # Report the problem on the page instead of crashing with a KeyError.
        st.error(
            "The sentence file needs a 'sentence' column and the topic file "
            "needs a 'topic' column."
        )
    else:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')

        # Hand the encoder plain lists of strings: blank sentence cells become
        # empty strings (so row count still matches df1) and blank topic cells
        # are skipped.
        corpus = df1['sentence'].fillna('').astype(str).tolist()
        topics = df2['topic'].dropna().astype(str).tolist()

        if corpus and topics:
            corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
            # Encode all topics in one batch and compute a single similarity
            # matrix: row i holds topic i scored against every sentence.
            topic_embeddings = embedder.encode(topics, convert_to_tensor=True)
            similarity = util.cos_sim(topic_embeddings, corpus_embeddings)
            for topic, topic_scores in zip(topics, similarity):
                # Store plain NumPy floats (moved off any accelerator) so the
                # column is numeric and exports cleanly to CSV.
                df1[topic] = topic_scores.cpu().numpy()

        st.write(df1)

        # Prefer st.cache_data where the installed Streamlit provides it and
        # fall back to the legacy st.cache decorator otherwise.
        _cache_csv = getattr(st, 'cache_data', None) or st.cache

        @_cache_csv
        def convert_df_to_csv(df):
            """Return *df* serialized as UTF-8 encoded CSV bytes.

            Cached so the conversion is not recomputed on every rerun.
            """
            return df.to_csv().encode('utf-8')

        st.download_button(
            label="Download data as CSV",
            data=convert_df_to_csv(df1),
            file_name='output.csv',
            mime='text/csv',
        )