import streamlit as st
import nltk
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download necessary NLTK data
nltk.download("punkt")
nltk.download("punkt_tab")  # some newer NLTK releases also need this resource for tokenization
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("vader_lexicon")

# Streamlit app configuration
st.set_page_config(page_title="NLP Basics", page_icon="🤖", layout="wide")
st.title("🤖 NLP Basics for Beginners")

st.markdown("""
### Welcome to the NLP Basics App!

Explore core Natural Language Processing (NLP) concepts with interactive examples, including:

- **Tokenization** (Splitting text into words/sentences)
- **Stopword Removal** (Filtering common words)
- **Lemmatization & Stemming** (Word normalization)
- **Bag of Words (BoW) & TF-IDF** (Vectorization techniques)
- **Named Entity Recognition (NER)** (Extract key entities)
- **Sentiment Analysis** (Understand emotions in text)
- **Text Summarization** (Generate concise summaries)
- **Word Cloud Visualization** (Highlight common words)
""")

# User Input Text Box
st.subheader("Enter Text to Analyze")
text_input = st.text_area("Type or paste some text here...", height=150)

if not text_input.strip():
    st.warning("Please enter some text to explore NLP concepts.")

# NLP Processing Sections
tabs = st.tabs(["Tokenization", "Stopwords", "Lemmatization & Stemming", "Bag of Words (BoW)", "TF-IDF"])

# Tokenization
with tabs[0]:
    st.header("🔤 Tokenization")
    st.write("**Types of Tokenization:**")
    st.write("- **Sentence Tokenization**: Splitting text into sentences.")
    st.write("- **Word Tokenization**: Splitting text into individual words (tokens).")
    st.write("**Example Input**: \"I love NLP. It's amazing!\"")
    st.write("**Sentence Tokens**: [\"I love NLP.\", \"It's amazing!\"]")
    st.write("**Word Tokens**: [\"I\", \"love\", \"NLP\", \".\", \"It\", \"'s\", \"amazing\", \"!\"]")
    if text_input.strip():
        st.write("**Sentence Tokenization**:", sent_tokenize(text_input))
        st.write("**Word Tokenization**:", word_tokenize(text_input))

# Stopwords Removal
with tabs[1]:
    st.header("🛑 Stopwords Removal")
    st.write("**Example Input**: \"This is an example of stopwords removal.\"")
    st.write("**Output**: [\"example\", \"stopwords\", \"removal\"]")
    if text_input.strip():
        # Keep only tokens that are not in NLTK's English stopword list
        stop_words = set(stopwords.words("english"))
        words = word_tokenize(text_input)
        filtered_words = [word for word in words if word.lower() not in stop_words]
        st.write("**Filtered Words:**", filtered_words)

# Lemmatization & Stemming
with tabs[2]:
    st.header("🌱 Lemmatization & Stemming")
    st.write("**Example Input**: \"running studies\"")
    st.write("**Stemmed Output**: [\"run\", \"studi\"]")
    st.write("**Lemmatized Output**: [\"running\", \"study\"] (WordNetLemmatizer treats words as nouns unless a POS tag such as pos=\"v\" is passed)")
    if text_input.strip():
        words = word_tokenize(text_input)
        ps = PorterStemmer()
        stemmed_words = [ps.stem(word) for word in words]
        st.write("**Stemmed Words:**", stemmed_words)
        # Note: without POS tags the lemmatizer defaults to nouns, so "running" stays "running"
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        st.write("**Lemmatized Words:**", lemmatized_words)

# Bag of Words (BoW)
with tabs[3]:
    st.header("📦 Bag of Words (BoW)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    st.write("**BoW Representation**: { 'I':1, 'love':1, 'NLP':2, 'is':1, 'great':1 }")
    if text_input.strip():
        # Note: CountVectorizer lowercases text and skips 1-character tokens by default
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform([text_input])
        st.write("**BoW Matrix:**", X.toarray())
        st.write("**Feature Names:**", vectorizer.get_feature_names_out())

# TF-IDF
with tabs[4]:
    st.header("📊 TF-IDF (Term Frequency-Inverse Document Frequency)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    st.write("**TF-IDF Representation**: Higher weights for rare but important words.")
    if text_input.strip():
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform([text_input])
        st.write("**TF-IDF Matrix:**", tfidf_matrix.toarray())
        st.write("**Feature Names:**", tfidf_vectorizer.get_feature_names_out())

# Footer
st.markdown("---")
st.markdown("""
© 2024 NLP Basics App. All Rights Reserved.
""")
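# ---------------------------------------------------------------------------
# Note: the welcome text above also lists Named Entity Recognition and
# Sentiment Analysis, and the script already imports spacy and downloads the
# vader_lexicon, but neither feature is wired into a tab. The block below is a
# minimal, illustrative sketch of how those sections might look; it is an
# assumption rather than the original app's implementation, and the NER part
# expects the spaCy model "en_core_web_sm" to be installed
# (python -m spacy download en_core_web_sm).
# ---------------------------------------------------------------------------
from nltk.sentiment import SentimentIntensityAnalyzer

if text_input.strip():
    # Sentiment Analysis (sketch): VADER returns neg/neu/pos/compound scores
    st.subheader("🙂 Sentiment Analysis (sketch)")
    sia = SentimentIntensityAnalyzer()
    st.json(sia.polarity_scores(text_input))

    # Named Entity Recognition (sketch): extract entity text and labels with spaCy
    st.subheader("🏷️ Named Entity Recognition (sketch)")
    try:
        nlp = spacy.load("en_core_web_sm")
        entities = [(ent.text, ent.label_) for ent in nlp(text_input).ents]
        st.write(entities if entities else "No named entities found.")
    except OSError:
        st.warning("spaCy model 'en_core_web_sm' is not installed.")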