import gradio as gr
import io
import numpy as np
import ctypes

def load_vectors(fname):
    """Load fastText .vec embeddings into a dict mapping token -> numpy array."""
    data = {}
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        fin.readline()  # skip the "<vocab size> <dimension>" header line
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    return data
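
# A .vec file is plain text: a "<vocab size> <dimension>" header line, then one
# line per word consisting of the token followed by 300 space-separated floats,
# e.g. (values illustrative):
#   the -0.0651 0.0930 0.0212 ... (300 numbers in total)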

vectors = load_vectors('wiki-news-300d-1M.vec')
# Pre-encode the vocabulary once so it can be passed to the C tokenizer.
tokens = [token.encode('utf-8') for token in vectors.keys()]

# Load the native tokenizer shared library and declare the signature of its
# tokenize() entry point.
lib = ctypes.CDLL('./tokenizer.so')
lib.tokenize.argtypes = [ctypes.c_char_p, ctypes.POINTER(ctypes.c_char_p),
                         ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
lib.tokenize.restype = ctypes.POINTER(ctypes.c_char_p)
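
# free_tokens is used below but its ctypes signature is never declared; the
# declaration here is inferred from the call site (pointer + count), not from
# any documented header.
lib.free_tokens.argtypes = [ctypes.POINTER(ctypes.c_char_p), ctypes.c_int]
lib.free_tokens.restype = None

# For reference, the shared library is assumed to export C functions shaped
# roughly like this (hypothetical prototypes inferred from the bindings above,
# not taken from the actual tokenizer source):
#
#   char **tokenize(const char *text, const char **vocab,
#                   int vocab_size, int *result_size);
#   void free_tokens(char **tokens, int count);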

def tokenize(text):
    """Tokenize text with the C library, restricted to the fastText vocabulary."""
    text = text.encode('utf-8')
    num_tokens = len(tokens)
    tokens_array = (ctypes.c_char_p * num_tokens)(*tokens)

    result_size = ctypes.c_int()
    result = lib.tokenize(text, tokens_array, num_tokens, ctypes.byref(result_size))

    # Copy the results into Python strings before releasing the C-side memory.
    python_tokens = [result[i].decode('utf-8') for i in range(result_size.value)]
    lib.free_tokens(result, result_size.value)

    return python_tokens
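
# Illustrative only; the actual split (and any lowercasing) depends on the
# C tokenizer's rules:
#   tokenize("The quick brown fox") -> ['the', 'quick', 'brown', 'fox']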

def onInput(paragraph):
    """Average the fastText vectors of a paragraph's tokens into one 300-d embedding."""
    tokens = tokenize(paragraph)
    if not tokens:
        return np.zeros(300).tolist()

    merged_vector = np.zeros(300)
    matched = 0
    for token in tokens:
        # Skip out-of-vocabulary tokens; they have no vector to contribute.
        if token not in vectors:
            continue
        merged_vector += vectors[token]
        matched += 1

    # Average over the tokens actually found, so out-of-vocabulary tokens do
    # not dilute the embedding; fall back to the zero vector if nothing matched.
    if matched:
        merged_vector /= matched

    return merged_vector.tolist()

# The embedding is returned as a list of floats and rendered as text in the UI.
demo = gr.Interface(fn=onInput, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()