import io

import gradio as gr
import numpy as np
from tok import Tokenizer
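# Requirements (assumed from the imports): `pip install gradio numpy tok`,
# plus the fastText "wiki-news-300d-1M.vec" embeddings file
# (https://fasttext.cc/docs/en/english-vectors.html) next to this script.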


def load_vectors(fname):
    """Load fastText-style .vec embeddings into a {word: vector} dict."""
    data = {}
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # .vec files start with a "<vocab_size> <dim>" header line; skip it
        # so it is not parsed as a word entry.
        fin.readline()
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    # Also return the vocabulary sorted longest-first, so the tokenizer
    # below prefers the longest matching protected word.
    return data, sorted(data.keys(), key=len, reverse=True)
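# Note: 1M words x 300 float64 components is roughly 2.4 GB resident;
# parsing with dtype=np.float32 in load_vectors would halve that if
# memory is tight.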
vectors, sorted_words = load_vectors('wiki-news-300d-1M.vec')

# Protect every vocabulary word so multi-word entries survive tokenization
# as single tokens that can be looked up in `vectors` directly.
tokenizer = Tokenizer(protected_words=sorted_words)


def tokenize(text):
    return tokenizer.word_tokenize(text)
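# Illustrative example (hypothetical strings): if "New York" is in the
# vocabulary, tokenize("I love New York") -> ["I", "love", "New York"]
# rather than ["I", "love", "New", "York"].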


def on_input(paragraph, progress=gr.Progress()):
    progress(0, desc="Tokenizing...")
    tokens = tokenize(paragraph)

    progress(0.1, desc="Initializing merged vector...")
    if not tokens:
        return np.zeros(300).tolist()

    merged_vector = np.zeros(300)
    merged_count = 0

    total_tokens = len(tokens)
    for ind, token in enumerate(tokens):
        # Map loop progress onto the 0.1-0.8 span of the progress bar.
        completion = 0.7 * ((ind + 1) / total_tokens)
        progress(0.1 + completion, desc=f"Merging {token}, Token #{ind + 1}/{total_tokens}")

        # Skip out-of-vocabulary tokens instead of raising a KeyError.
        vector = vectors.get(token)
        if vector is None:
            continue
        merged_vector += vector
        merged_count += 1

    progress(0.9, desc="Averaging...")
    if merged_count:
        merged_vector /= merged_count

    progress(1, desc="Converting to list...")
    return merged_vector.tolist()
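# The averaged vector is a simple bag-of-words sentence embedding; two
# outputs can be compared with cosine similarity, e.g. (hypothetical):
#   a = np.array(on_input("a good movie"))
#   b = np.array(on_input("a great film"))
#   sim = a @ b / (np.linalg.norm(a) * np.linalg.norm(b))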
demo = gr.Interface(fn=on_input, inputs="text", outputs="text")
# gr.Progress updates are only streamed when the queue is enabled; calling
# queue() explicitly is harmless on Gradio versions where it is the default.
demo.queue()
demo.launch()
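# Run with `python app.py` (assumed filename); Gradio serves the UI at
# http://127.0.0.1:7860 by default.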