|
import streamlit as st |
|
import nltk |
|
import spacy |
|
from nltk.tokenize import word_tokenize, sent_tokenize |
|
from nltk.corpus import stopwords |
|
from nltk.stem import PorterStemmer, WordNetLemmatizer |
|
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer |
|
|
|
|
|
# Fetch the NLTK corpora/models this app depends on.
# quiet=True keeps download progress out of the console on every Streamlit rerun.
# "punkt_tab" is required by word_tokenize/sent_tokenize on NLTK >= 3.8.2;
# on older NLTK versions an unknown resource id is reported but does not raise.
for _nltk_resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "vader_lexicon"):
    nltk.download(_nltk_resource, quiet=True)
|
|
|
|
|
|
|
# Configure the page before any other st.* call (Streamlit requires this first).
# Fixed mojibake: the original emoji were mangled by an encoding round trip
# ("π€" is what the UTF-8 bytes of a robot emoji look like after an
# ISO-8859-7 decode).
st.set_page_config(page_title="NLP Basics", page_icon="🤖", layout="wide")

st.title("🤖 NLP Basics for Beginners")
|
|
|
# Intro blurb. The original also advertised NER, Sentiment Analysis, Text
# Summarization, and Word Cloud sections that the app never implements (only
# five tabs exist below) — the list now matches the features actually offered.
st.markdown("""
### Welcome to the NLP Basics App!

Explore core Natural Language Processing (NLP) concepts with interactive examples, including:

- **Tokenization** (Splitting text into words/sentences)
- **Stopword Removal** (Filtering common words)
- **Lemmatization & Stemming** (Word normalization)
- **Bag of Words (BoW) & TF-IDF** (Vectorization techniques)
""")
|
|
|
|
|
st.subheader("Enter Text to Analyze")
text_input = st.text_area("Type or paste some text here...", height=150)

# Nudge the user when the box is empty; the tabs below still render their
# static examples either way.
if not text_input.strip():
    st.warning("Please enter some text to explore NLP concepts.")

# One tab per NLP concept demonstrated by this app.
tabs = st.tabs([
    "Tokenization",
    "Stopwords",
    "Lemmatization & Stemming",
    "Bag of Words (BoW)",
    "TF-IDF",
])
|
|
|
|
|
with tabs[0]:
    # Emoji restored from mojibake "π€" (🔤 "input latin letters" fits
    # tokenization; the garbled bytes match several F0 9F .. A4 emoji).
    st.header("🔤 Tokenization")
    st.write("**Types of Tokenization:**")
    st.write("- **Sentence Tokenization**: Splitting text into sentences.")
    st.write("- **Word Tokenization**: Splitting text into individual words (tokens).")
    st.write("**Example Input**: \"I love NLP. It's amazing!\"")
    st.write("**Sentence Tokens**: [\"I love NLP.\", \"It's amazing!\"]")
    st.write("**Word Tokens**: [\"I\", \"love\", \"NLP\", \".\", \"It\", \"'s\", \"amazing\", \"!\"]")
    # Live demo runs only when the user actually typed something.
    if text_input.strip():
        st.write("**Sentence Tokenization**:", sent_tokenize(text_input))
        st.write("**Word Tokenization**:", word_tokenize(text_input))
|
|
|
|
|
with tabs[1]:
    # Emoji restored from mojibake "π" (best guess — original bytes were
    # dropped by the encoding corruption; TODO confirm intended emoji).
    st.header("🛑 Stopwords Removal")
    st.write("**Example Input**: \"This is an example of stopwords removal.\"")
    st.write("**Output**: [\"example\", \"stopwords\", \"removal\"]")
    if text_input.strip():
        stop_words = set(stopwords.words("english"))
        words = word_tokenize(text_input)
        # Drop stopwords AND bare punctuation tokens: without the isalnum()
        # check the output kept tokens like "." and "!", contradicting the
        # example shown above.
        filtered_words = [
            word for word in words
            if word.isalnum() and word.lower() not in stop_words
        ]
        st.write("**Filtered Words:**", filtered_words)
|
|
|
|
|
with tabs[2]:
    # Emoji restored from mojibake "π±" (🌱 = F0 9F 8C B1; ISO-8859-7 maps
    # F0→π and B1→±, matching the corrupted text exactly).
    st.header("🌱 Lemmatization & Stemming")
    st.write("**Example Input**: \"running studies\"")
    st.write("**Stemmed Output**: [\"run\", \"studi\"]")
    # Corrected example: WordNetLemmatizer defaults to noun POS, so
    # "running" is returned unchanged unless pos="v" is supplied; the
    # original text wrongly promised "run".
    st.write("**Lemmatized Output**: [\"running\", \"study\"]")
    if text_input.strip():
        words = word_tokenize(text_input)
        ps = PorterStemmer()
        stemmed_words = [ps.stem(word) for word in words]
        st.write("**Stemmed Words:**", stemmed_words)
        lemmatizer = WordNetLemmatizer()
        # Default POS is noun — verbs/adjectives are mostly left unchanged.
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        st.write("**Lemmatized Words:**", lemmatized_words)
|
|
|
|
|
with tabs[3]:
    # Emoji restored from mojibake "π¦" (📦 = F0 9F 93 A6; ISO-8859-7 maps
    # F0→π and A6→¦, matching the corrupted text exactly).
    st.header("📦 Bag of Words (BoW)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    # Corrected example: CountVectorizer lowercases and its default
    # token_pattern ignores single-character tokens, so "I" never appears in
    # the vocabulary — the original example output was unreachable.
    st.write("**BoW Representation**: { 'great':1, 'is':1, 'love':1, 'nlp':2 }")
    if text_input.strip():
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform([text_input])
        st.write("**BoW Matrix:**", X.toarray())
        st.write("**Feature Names:**", vectorizer.get_feature_names_out())
|
|
|
|
|
with tabs[4]:
    # Emoji restored from mojibake "π" (📊 is a plausible original; the bytes
    # after F0 were dropped by the corruption — TODO confirm intended emoji).
    st.header("📊 TF-IDF (Term Frequency-Inverse Document Frequency)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    st.write("**TF-IDF Representation**: Higher weights for rare but important words.")
    if text_input.strip():
        tfidf_vectorizer = TfidfVectorizer()
        # NOTE: with a single document every term has the same IDF, so the
        # weights reduce to normalized term frequencies — fine for a demo.
        tfidf_matrix = tfidf_vectorizer.fit_transform([text_input])
        st.write("**TF-IDF Matrix:**", tfidf_matrix.toarray())
        st.write("**Feature Names:**", tfidf_vectorizer.get_feature_names_out())
|
|
|
|
|
|
|
st.markdown("---")

# Fixed mojibake: "Β©" is the copyright sign © (UTF-8 C2 A9) after an
# ISO-8859-7 decode (C2→Β, A9→©).
st.markdown("""
<center>
<p style='font-size:14px;'>© 2024 NLP Basics App. All Rights Reserved.</p>
</center>
""", unsafe_allow_html=True)
|
|