import gradio as gr
import re
import contractions
import unicodedata
import nltk
# 'punkt' powers nltk.word_tokenize; 'stopwords' is required by
# remove_stopwords below (the original only fetched 'punkt').
nltk.download('punkt')
nltk.download('stopwords')
import spacy

# Load the small English spaCy pipeline once and reuse it for lemmatization
# (the original loaded the model twice).
nlp = spacy.load('en_core_web_sm')
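
# If 'en_core_web_sm' is not installed, the load above raises OSError. One
# workaround (an assumption about the deployment, not part of the original
# app) is to fetch it at startup:
#   import spacy.cli
#   spacy.cli.download('en_core_web_sm')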
def spacy_lemmatize_text(text):
    # Replace each token with its lemma; '-PRON-' was the lemma spaCy v2
    # assigned to pronouns, in which case the original token text is kept.
    doc = nlp(text)
    return ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc])
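
# Illustrative behaviour (exact output depends on the spaCy model version):
#   spacy_lemmatize_text('the cats were running')  ->  'the cat be run'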
def remove_accented_chars(text):
    # NFKD decomposes accented characters into base letter + combining mark,
    # so the ASCII round-trip keeps the base letter. The original used 'NFC',
    # which leaves characters composed and drops them entirely.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
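
# Illustrative behaviour after the NFKD fix:
#   remove_accented_chars('café')  ->  'cafe'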
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text
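
# Illustrative behaviour:
#   remove_special_characters('Hello, world!')  ->  'Hello world'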
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
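
# Illustrative behaviour (assuming NLTK's English stopword list):
#   remove_stopwords('this is a simple test')  ->  'simple test'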
def greet(sentence):
    # Full pipeline: strip accents, expand contractions, remove punctuation
    # and digits, lowercase, lemmatize, then drop stopwords.
    opo_texto_sem_caracteres_especiais = remove_accented_chars(sentence)  # (Portuguese: "text without special characters")
    # sentenceMCTIList_base = nltk.word_tokenize(opo_texto_sem_caracteres_especiais)
    sentenceExpanded = contractions.fix(opo_texto_sem_caracteres_especiais)
    sentenceWithoutPunctuation = remove_special_characters(sentenceExpanded, remove_digits=True)
    sentenceLowered = sentenceWithoutPunctuation.lower()
    sentenceLemmatized = spacy_lemmatize_text(sentenceLowered)
    sentenceLemStopped = remove_stopwords(sentenceLemmatized, is_lower_case=False)
    # Tokenize the fully preprocessed sentence; the original tokenized the raw
    # input here, discarding every step above.
    return nltk.word_tokenize(sentenceLemStopped)
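
# Quick sanity check (illustrative): the pipeline expands "Don't" to "do not",
# strips punctuation, lemmatizes, and drops stopwords, so
#   greet("Don't panic!")  ->  ['panic']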
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()
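# Note: on Hugging Face Spaces, launch() needs no arguments; when running
# locally, iface.launch(share=True) creates a temporary public link.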