File size: 1,967 Bytes
f3e7585
 
 
 
 
 
 
 
acd3b9d
f3e7585
ea15f23
 
c59f771
ea15f23
f3e7585
c59f771
 
 
f3e7585
 
 
 
 
 
 
 
 
 
f635e68
f3e7585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e22c63
f3e7585
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import gradio as gr
import re
import contractions
import unicodedata

import numpy as np
import nltk
# Fetch NLTK resources at startup: 'punkt' for word_tokenize,
# 'stopwords' for the English stopword list used in remove_stopwords.
nltk.download('punkt')
nltk.download('stopwords')

import os

# Install the small English spaCy model at runtime (e.g. on a fresh
# Hugging Face Space) so the import below succeeds.
os.system('python -m spacy download en_core_web_sm')

import spacy
import en_core_web_sm
# Module-level spaCy pipeline shared by spacy_lemmatize_text.
nlp = en_core_web_sm.load()
# nlp = spacy.load('en_core_web_sm')

def spacy_lemmatize_text(text):
  """Lemmatize *text* with the module-level spaCy pipeline.

  Pronouns (lemma '-PRON-') are kept as their surface form; all other
  tokens are replaced by their lemma. Returns a space-joined string.
  """
  doc = nlp(text)
  lemmas = []
  for token in doc:
    if token.lemma_ == '-PRON-':
      lemmas.append(token.text)
    else:
      lemmas.append(token.lemma_)
  return ' '.join(lemmas)

def remove_accented_chars(text):
  """Replace accented characters with their plain-ASCII base letters.

  Uses NFKD normalization to *decompose* each accented character into a
  base letter plus combining marks; the ASCII encode/decode round-trip
  then discards the (non-ASCII) combining marks, so 'café' -> 'cafe'.

  BUGFIX: the original used 'NFC', which composes characters instead —
  the ASCII encode then dropped accented letters entirely
  ('café' -> 'caf'), losing text.
  """
  text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  return text
 
def remove_special_characters(text, remove_digits=False):
  """Strip punctuation and symbols, keeping letters and whitespace.

  When ``remove_digits`` is True, digits are stripped as well;
  otherwise they are preserved.
  """
  if remove_digits:
    disallowed = r'[^a-zA-Z\s]'
  else:
    disallowed = r'[^a-zA-Z0-9\s]'
  return re.sub(disallowed, '', text)
  
def remove_stopwords(text, is_lower_case=False, stopwords=None):
  """Drop stopwords from *text* and return the remaining tokens joined by spaces.

  If no stopword list is supplied (or an empty one is passed), the NLTK
  English stopword list is used. When ``is_lower_case`` is True the
  tokens are compared to the list as-is; otherwise each token is
  lowercased before the membership test.
  """
  if not stopwords:
    stopwords = nltk.corpus.stopwords.words('english')
  words = [w.strip() for w in nltk.word_tokenize(text)]

  if is_lower_case:
    kept = [w for w in words if w not in stopwords]
  else:
    kept = [w for w in words if w.lower() not in stopwords]

  return ' '.join(kept)

def greet(sentence):
  """Run the full preprocessing pipeline on *sentence*.

  Steps: strip accents -> expand contractions -> drop punctuation and
  digits -> lowercase -> lemmatize -> remove stopwords. Returns the
  final token list.
  """
  no_accents = remove_accented_chars(sentence)
  # sentenceMCTIList_base = nltk.word_tokenize(opo_texto_sem_caracteres_especiais)
  expanded = contractions.fix(no_accents)
  no_punct = remove_special_characters(expanded, remove_digits=True)
  lowered = no_punct.lower()
  lemmatized = spacy_lemmatize_text(lowered)
  without_stopwords = remove_stopwords(lemmatized, is_lower_case=False)

  return nltk.word_tokenize(without_stopwords)

# Expose the preprocessing pipeline as a simple text-in / text-out Gradio app.
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()