import os
import re
import unicodedata

import contractions
import gradio as gr
import nltk

# NLTK data needed by word_tokenize and the English stopword list.
# ('punkt_tab' is required by newer NLTK releases.)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Download the small English spaCy model at startup; a common workaround on
# hosted platforms (e.g. Hugging Face Spaces) where the model is not
# preinstalled.
os.system('python -m spacy download en_core_web_sm')

import en_core_web_sm

nlp = en_core_web_sm.load()

def spacy_lemmatize_text(text):
    doc = nlp(text)
    # spaCy 2.x lemmatizes pronouns to the placeholder '-PRON-'; fall back to
    # the original token text in that case (spaCy 3.x no longer emits it).
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc])
    return text

def remove_accented_chars(text):
    # NFKD decomposes an accented character into its base character plus a
    # combining mark, so encoding to ASCII drops only the accent
    # (e.g. 'café' -> 'cafe'). With NFC, 'é' would stay a single code point
    # and the whole character would be discarded.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

def remove_stopwords(text, is_lower_case=False, stopwords=None):
    # 'is None' rather than 'not stopwords', so an explicitly passed empty
    # list is respected instead of being replaced by the NLTK defaults.
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]

    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]

    return ' '.join(filtered_tokens)

def greet(sentence):
    # Preprocessing pipeline: strip accents, expand contractions, drop
    # punctuation and digits, lowercase, lemmatize, remove stopwords,
    # then tokenize the result.
    opo_texto_sem_caracteres_especiais = remove_accented_chars(sentence)

    sentenceExpanded = contractions.fix(opo_texto_sem_caracteres_especiais)
    sentenceWithoutPunctuation = remove_special_characters(sentenceExpanded, remove_digits=True)
    sentenceLowered = sentenceWithoutPunctuation.lower()
    sentenceLemmatized = spacy_lemmatize_text(sentenceLowered)
    # The text is already lowercased at this point, so the tokens can be
    # compared against the stopword list directly.
    sentenceLemStopped = remove_stopwords(sentenceLemmatized, is_lower_case=True)

    return nltk.word_tokenize(sentenceLemStopped)
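
# Illustrative check of the pipeline (the exact tokens depend on the installed
# spaCy model and NLTK stopword list). An input such as
#   greet("I can't believe it's café!")
# should come out roughly as ['believe', 'cafe']: the accent is stripped,
# "can't" and "it's" are expanded, and stopwords like "can", "not", "it",
# and "be" are filtered away.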

# greet returns a list of tokens; Gradio's "text" output renders it as the
# list's string form.
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()