File size: 1,825 Bytes
935a660
eabf510
82413ee
eabf510
0b6790a
4480f3c
d0d62c4
 
4480f3c
09e6c30
ee5ab0e
 
09e6c30
eabf510
 
 
 
 
 
91b81a0
 
 
 
 
eabf510
 
 
4480f3c
ee5ab0e
 
 
 
eabf510
ee5ab0e
 
 
eabf510
ee5ab0e
 
78dfe4e
4168128
eabf510
4480f3c
 
 
935a660
f2e9695
eabf510
 
5b71e40
 
 
 
b5f7332
f5f665e
ee4ac1e
f5f665e
ee5ab0e
 
418bcdd
ee5ab0e
 
f414b62
 
 
ee5ab0e
7bc957d
b26ceaa
d0d62c4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import streamlit as st
import plotly.express as px
import pandas as pd
import random
import logging
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset


@st.cache_resource
def load_model(name):
    """Instantiate the sentence-embedding model identified by ``name``.

    The function is wrapped in ``st.cache_resource``, so Streamlit reuses the
    constructed model for repeated calls with the same ``name`` instead of
    building it again.

    Args:
        name: Model identifier handed straight to ``SentenceTransformer``.

    Returns:
        The ``SentenceTransformer`` instance created for ``name``.
    """
    encoder = SentenceTransformer(name)
    return encoder


@st.cache_data
def load_words_dataset():
    """Return the ``"Word"`` column of the WordNet-definitions train split.

    Loads ``marksverdhei/wordnet-definitions-en-2021`` through
    ``datasets.load_dataset``; the result is cached by ``st.cache_data``.
    """
    return load_dataset(
        "marksverdhei/wordnet-definitions-en-2021",
        split="train",
    )["Word"]

@st.cache_data
def choose_secret_word():
    """Pick one word at random from the loaded word list.

    The function takes no arguments and is wrapped in ``st.cache_data``, so
    the value picked on the first call is the one served on later calls while
    the cache entry lives.

    Returns:
        A single entry drawn from ``load_words_dataset()``.
    """
    candidates = load_words_dataset()
    secret = random.choice(candidates)
    return secret


# Vocabulary the secret word is drawn from.
all_words = load_words_dataset()


# Embedding models scored side by side; list order fixes the order of the
# similarity values and of the result columns.
model_names = [
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    'BAAI/bge-small-en-v1.5',
]

# Model identifier -> loaded encoder.
models = dict(zip(model_names, map(load_model, model_names)))

# The secret is lower-cased and stripped before being embedded once per model.
secret_word = choose_secret_word().lower().strip()
secret_embedding = [
    models[model_name].encode(secret_word) for model_name in model_names
]

print("Secret word ", secret_word)


# Per-session guess history; each entry is [word, similarity per model...],
# with similarities ordered like ``model_names``.
if 'words' not in st.session_state:
    st.session_state['words'] = []




st.write('Try to guess a secret word by semantic similarity')

word = st.text_input("Input a word")

# Normalise once and use the same form for scoring, de-duplication and
# storage. Comparing the raw input instead let "Cat", "cat " and "cat" be
# recorded as separate guesses with identical scores.
guess = word.lower().strip()
used_words = {str(w[0]).lower().strip() for w in st.session_state['words']}

# The button is rendered for its submit/rerun effect only. A non-empty text
# input already triggers scoring, and an empty one must not be scored even
# when the button is clicked (previously that added a blank row).
st.button("Guess")

if guess and guess not in used_words:
    # One cosine similarity per model, between the secret and the guess.
    similarities = [
        util.pytorch_cos_sim(secret_vector, models[name].encode(guess)).cpu().numpy()[0][0]
        for name, secret_vector in zip(model_names, secret_embedding)
    ]
    st.session_state['words'].append([guess] + similarities)



# Column labels are derived from ``model_names`` so the table and its sort key
# stay in sync with the configured models. The sort column used to be a
# hard-coded copy of the first model's name and raised KeyError whenever the
# model list was edited.
similarity_columns = ["Similarity for " + name for name in model_names]

# Best guesses first, ranked by the first configured model.
words_df = pd.DataFrame(
    st.session_state['words'],
    columns=["word"] + similarity_columns,
).sort_values(by=[similarity_columns[0]], ascending=False)
st.dataframe(words_df, use_container_width=True)