# NLP / pages / Basics Of NLP.py
# Streamlit page demonstrating basic NLP concepts (tokenization, stopwords,
# stemming/lemmatization, BoW, TF-IDF).
import streamlit as st
import nltk
import spacy  # NOTE(review): imported but unused here — possibly needed by sibling pages; verify before removing
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download the NLTK resources the tokenizers/stemmers below rely on.
# NLTK >= 3.8.2 looks up the tokenizer models under "punkt_tab", while older
# versions use "punkt" — fetch both so word_tokenize/sent_tokenize work on
# either. quiet=True suppresses progress output on every Streamlit rerun.
# "vader_lexicon" is not used in this file — presumably for a sentiment page;
# kept for backward compatibility.
for _resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "vader_lexicon"):
    nltk.download(_resource, quiet=True)
# Streamlit app configuration
st.set_page_config(page_title="NLP Basics", page_icon="πŸ€–", layout="wide")
st.title("πŸ€– NLP Basics for Beginners")

# Intro copy. Lists exactly the concepts implemented in the tabs below —
# previously it also advertised NER, sentiment analysis, summarization and
# word clouds, none of which exist in this page.
st.markdown("""
### Welcome to the NLP Basics App!
Explore core Natural Language Processing (NLP) concepts with interactive examples, including:
- **Tokenization** (Splitting text into words/sentences)
- **Stopword Removal** (Filtering common words)
- **Lemmatization & Stemming** (Word normalization)
- **Bag of Words (BoW) & TF-IDF** (Vectorization techniques)
""")

# User Input Text Box
st.subheader("Enter Text to Analyze")
text_input = st.text_area("Type or paste some text here...", height=150)

# Nudge the user when the box is empty; each tab below simply skips its
# interactive section until text is provided (the static examples still show).
if not text_input.strip():
    st.warning("Please enter some text to explore NLP concepts.")

# NLP Processing Sections — one tab per concept demonstrated below.
tabs = st.tabs(["Tokenization", "Stopwords", "Lemmatization & Stemming", "Bag of Words (BoW)", "TF-IDF"])
# Tab 0: Tokenization — split the user's text into sentences and words.
with tabs[0]:
    st.header("πŸ”€ Tokenization")
    st.write("**Types of Tokenization:**")
    st.write("- **Sentence Tokenization**: Splitting text into sentences.")
    st.write("- **Word Tokenization**: Splitting text into individual words (tokens).")
    st.write("**Example Input**: \"I love NLP. It's amazing!\"")
    st.write("**Sentence Tokens**: [\"I love NLP.\", \"It's amazing!\"]")
    st.write("**Word Tokens**: [\"I\", \"love\", \"NLP\", \".\", \"It\", \"'s\", \"amazing\", \"!\"]")
    if text_input.strip():
        # Tokenize once into named intermediates, then display both views.
        sentence_tokens = sent_tokenize(text_input)
        word_tokens = word_tokenize(text_input)
        st.write("**Sentence Tokenization**:", sentence_tokens)
        st.write("**Word Tokenization**:", word_tokens)
# Tab 1: Stopwords Removal — drop common English function words.
with tabs[1]:
    st.header("πŸ›‘ Stopwords Removal")
    st.write("**Example Input**: \"This is an example of stopwords removal.\"")
    st.write("**Output**: [\"example\", \"stopwords\", \"removal\"]")
    if text_input.strip():
        # Set membership makes the per-token check O(1); compare case-folded.
        english_stopwords = set(stopwords.words("english"))
        tokens = word_tokenize(text_input)
        kept_tokens = [tok for tok in tokens if tok.lower() not in english_stopwords]
        st.write("**Filtered Words:**", kept_tokens)
# Tab 2: Lemmatization & Stemming — two flavors of word normalization.
with tabs[2]:
    st.header("🌱 Lemmatization & Stemming")
    st.write("**Example Input**: \"running studies\"")
    st.write("**Stemmed Output**: [\"run\", \"studi\"]")
    st.write("**Lemmatized Output**: [\"run\", \"study\"]")
    if text_input.strip():
        tokens = word_tokenize(text_input)
        # Stemming: rule-based suffix stripping (can yield non-words, e.g. "studi").
        stemmer = PorterStemmer()
        st.write("**Stemmed Words:**", [stemmer.stem(tok) for tok in tokens])
        # Lemmatization: WordNet-dictionary lookup that returns valid base forms.
        wn_lemmatizer = WordNetLemmatizer()
        st.write("**Lemmatized Words:**", [wn_lemmatizer.lemmatize(tok) for tok in tokens])
# Tab 3: Bag of Words — raw term counts over the single user document.
with tabs[3]:
    st.header("πŸ“¦ Bag of Words (BoW)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    st.write("**BoW Representation**: { 'I':1, 'love':1, 'NLP':2, 'is':1, 'great':1 }")
    if text_input.strip():
        # Fit on the one-document corpus; each column is a vocabulary term.
        bow_vectorizer = CountVectorizer()
        count_matrix = bow_vectorizer.fit_transform([text_input])
        st.write("**BoW Matrix:**", count_matrix.toarray())
        st.write("**Feature Names:**", bow_vectorizer.get_feature_names_out())
# Tab 4: TF-IDF — term counts reweighted by inverse document frequency.
with tabs[4]:
    st.header("πŸ“Š TF-IDF (Term Frequency-Inverse Document Frequency)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    st.write("**TF-IDF Representation**: Higher weights for rare but important words.")
    if text_input.strip():
        # With a single document, IDF is uniform — weights reduce to
        # normalized term frequencies, which is fine for the demo.
        weight_vectorizer = TfidfVectorizer()
        weight_matrix = weight_vectorizer.fit_transform([text_input])
        st.write("**TF-IDF Matrix:**", weight_matrix.toarray())
        st.write("**Feature Names:**", weight_vectorizer.get_feature_names_out())
# Footer
st.markdown("---")
# Centered copyright line; unsafe_allow_html is required because Streamlit
# escapes raw HTML in markdown by default. (The "Β©" appears mojibake-encoded
# in this file — presumably meant to render as the copyright sign; confirm
# the file's encoding before "fixing".)
st.markdown("""
<center>
<p style='font-size:14px;'>Β© 2024 NLP Basics App. All Rights Reserved.</p>
</center>
""", unsafe_allow_html=True)