Ilde committed
Commit effe832 · 1 Parent(s): 1569560

remove the nltk tokenizer

Files changed (1)
  1. app.py +1 -3
app.py CHANGED
@@ -2,9 +2,7 @@ from gensim.models.keyedvectors import KeyedVectors
 import pickle
 from nltk.tokenize import word_tokenize
 import gradio as gr
-import nltk
 
-nltk.download('punkt')
 
 # Use gensim KeyedVectors to read the embeddings
 wordvectors_file_vec = 'smaller_model_spa.txt'
@@ -14,7 +12,7 @@ with open('stop_words.pkl', 'rb') as f:
 
 
 def filter_words(x):
-    word_tokens = word_tokenize(x, language = "spanish")
+    word_tokens = x.split(' ')  # crude whitespace tokenization: nltk's word_tokenize misbehaves on HF
     filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
     return filtered_sentence
 
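Note: plain x.split(' ') keeps punctuation glued to words ("hola," never matches the stop word "hola") and yields empty tokens on repeated spaces. A minimal dependency-free sketch, not part of this commit, that stays closer to word_tokenize; the stop_words set below is a stand-in for the one the app loads from stop_words.pkl:

import re

# Stand-in stop-word set; the app actually unpickles this from stop_words.pkl
stop_words = {"de", "la", "el"}

def filter_words(x):
    # \w+ extracts runs of word characters, so trailing punctuation
    # ("hola," -> "hola") and repeated spaces no longer break filtering
    word_tokens = re.findall(r"\w+", x, re.UNICODE)
    filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]
    return filtered_sentence

print(filter_words("La casa,  el perro y  la luna"))  # ['casa', 'perro', 'y', 'luna']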