import gradio as gr
import re
import contractions
import unicodedata
import os
import nltk

# NLTK data needed for tokenization and stopword removal
nltk.download('punkt')
nltk.download('stopwords')

# Download the spaCy English model at startup (handy on hosted environments
# where it is not pre-installed)
os.system('python -m spacy download en_core_web_sm')
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()
# nlp = spacy.load('en_core_web_sm')  # equivalent alternative
def spacy_lemmatize_text(text):
    # Lemmatize with spaCy; '-PRON-' was the pronoun lemma in spaCy 2.x,
    # so pronouns are kept as their surface form
    doc = nlp(text)
    return ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc])
def remove_accented_chars(text):
    # NFKD decomposes accented characters into base letter + combining mark,
    # so 'café' becomes 'cafe' instead of being dropped by the ASCII encode
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
def remove_special_characters(text, remove_digits=False):
    # Keep letters (and optionally digits) plus whitespace; drop everything else
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    return re.sub(pattern, '', text)
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        # Compare case-insensitively when the input has not been lowercased
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    return ' '.join(filtered_tokens)
def greet(sentence):
    # Preprocessing pipeline: strip accents, expand contractions, remove
    # punctuation and digits, lowercase, lemmatize, drop stopwords, tokenize
    opo_texto_sem_caracteres_especiais = remove_accented_chars(sentence)
    sentenceExpanded = contractions.fix(opo_texto_sem_caracteres_especiais)
    sentenceWithoutPunctuation = remove_special_characters(sentenceExpanded, remove_digits=True)
    sentenceLowered = sentenceWithoutPunctuation.lower()
    sentenceLemmatized = spacy_lemmatize_text(sentenceLowered)
    sentenceLemStopped = remove_stopwords(sentenceLemmatized, is_lower_case=False)
    return nltk.word_tokenize(sentenceLemStopped)
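
# Hypothetical sanity check (exact lemmas depend on the spaCy model version
# and NLTK's stopword list):
#   greet("The cats aren't running!")
#   # -> something like ['cat', 'run']
#   # ("aren't" expands to "are not"; both words are NLTK stopwords)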
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()