File size: 4,510 Bytes
ff31579
 
8d56add
ff31579
 
 
 
 
 
 
 
 
8d56add
d092a4f
8d56add
ff31579
 
 
 
 
 
 
 
 
 
 
8d56add
 
 
 
ff31579
 
 
 
 
 
 
 
 
 
79116c1
ff31579
 
 
 
8d56add
 
 
 
 
 
ff31579
8d56add
 
ff31579
 
 
 
8d56add
 
ff31579
 
 
 
 
 
 
 
 
8d56add
 
 
ff31579
 
 
 
 
 
 
 
 
 
 
 
8d56add
 
ff31579
 
 
8d56add
ff31579
 
 
 
 
8d56add
 
ff31579
 
 
8d56add
ff31579
2f0d6e1
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import streamlit as st
import nltk
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download the NLTK resources the app needs. `quiet=True` keeps the download
# progress output from flooding the console on every Streamlit rerun.
# NOTE(review): newer NLTK releases (>= 3.8.2) require "punkt_tab" in addition
# to "punkt" for sent_tokenize/word_tokenize -- download both so tokenization
# works across NLTK versions (a download of an already-present or unknown
# resource is a harmless no-op).
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
# vader_lexicon has no sentiment-analysis code in this file yet -- presumably
# downloaded for a planned sentiment tab; kept for backward compatibility.
nltk.download("vader_lexicon", quiet=True)


# Streamlit app configuration
st.set_page_config(page_title="NLP Basics", page_icon="🤖", layout="wide")
st.title("🤖 NLP Basics for Beginners")

# Intro copy. Only the features that actually have a tab below are listed --
# the previous text also advertised NER, sentiment analysis, summarization and
# word clouds, none of which are implemented in this file.
st.markdown("""
### Welcome to the NLP Basics App!
Explore core Natural Language Processing (NLP) concepts with interactive examples, including:
- **Tokenization** (Splitting text into words/sentences)
- **Stopword Removal** (Filtering common words)
- **Lemmatization & Stemming** (Word normalization)
- **Bag of Words (BoW) & TF-IDF** (Vectorization techniques)
""")

# User Input Text Box
st.subheader("Enter Text to Analyze")
text_input = st.text_area("Type or paste some text here...", height=150)

# Nudge the user until there is something non-blank to analyze; each tab
# below independently re-checks text_input before running its demo.
if not text_input.strip():
    st.warning("Please enter some text to explore NLP concepts.")

# NLP Processing Sections -- one tab per concept; tab indices match the
# `with tabs[i]:` blocks that follow.
tabs = st.tabs(["Tokenization", "Stopwords", "Lemmatization & Stemming", "Bag of Words (BoW)", "TF-IDF"])

# Tokenization
with tabs[0]:
    st.header("🔤 Tokenization")
    # Static explainer shown regardless of user input.
    explainer = [
        "**Types of Tokenization:**",
        "- **Sentence Tokenization**: Splitting text into sentences.",
        "- **Word Tokenization**: Splitting text into individual words (tokens).",
        "**Example Input**: \"I love NLP. It's amazing!\"",
        "**Sentence Tokens**: [\"I love NLP.\", \"It's amazing!\"]",
        "**Word Tokens**: [\"I\", \"love\", \"NLP\", \".\", \"It\", \"'s\", \"amazing\", \"!\"]",
    ]
    for line in explainer:
        st.write(line)
    # Live demo only once the user has typed non-blank text.
    if text_input.strip():
        st.write("**Sentence Tokenization**:", sent_tokenize(text_input))
        st.write("**Word Tokenization**:", word_tokenize(text_input))

# Stopwords Removal
with tabs[1]:
    st.header("🛑 Stopwords Removal")
    st.write("**Example Input**: \"This is an example of stopwords removal.\"")
    st.write("**Output**: [\"example\", \"stopwords\", \"removal\"]")
    if text_input.strip():
        stop_words = set(stopwords.words("english"))
        words = word_tokenize(text_input)
        # Drop stopwords AND bare punctuation tokens. Punctuation (".", "!",
        # ...) is not in NLTK's stopword list, so the original code leaked it
        # into the output, contradicting the documented example above.
        filtered_words = [
            word for word in words
            if word.lower() not in stop_words and any(ch.isalnum() for ch in word)
        ]
        st.write("**Filtered Words:**", filtered_words)

# Lemmatization & Stemming
with tabs[2]:
    st.header("🌱 Lemmatization & Stemming")
    st.write("**Example Input**: \"running studies\"")
    st.write("**Stemmed Output**: [\"run\", \"studi\"]")
    st.write("**Lemmatized Output**: [\"run\", \"study\"]")
    if text_input.strip():
        tokens = word_tokenize(text_input)
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        # Stemming chops suffixes heuristically; lemmatization maps to
        # dictionary forms -- shown side by side for comparison.
        st.write("**Stemmed Words:**", [stemmer.stem(tok) for tok in tokens])
        st.write("**Lemmatized Words:**", [lemmatizer.lemmatize(tok) for tok in tokens])

# Bag of Words (BoW)
with tabs[3]:
    st.header("📦 Bag of Words (BoW)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    st.write("**BoW Representation**: { 'I':1, 'love':1, 'NLP':2, 'is':1, 'great':1 }")
    if text_input.strip():
        # Fit on the single user document; the matrix row holds raw counts
        # and the feature names give the column order.
        bow_vectorizer = CountVectorizer()
        count_matrix = bow_vectorizer.fit_transform([text_input])
        st.write("**BoW Matrix:**", count_matrix.toarray())
        st.write("**Feature Names:**", bow_vectorizer.get_feature_names_out())

# TF-IDF
with tabs[4]:
    st.header("📊 TF-IDF (Term Frequency-Inverse Document Frequency)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    st.write("**TF-IDF Representation**: Higher weights for rare but important words.")
    if text_input.strip():
        # With a single-document corpus every IDF is identical, so this demo
        # effectively shows normalized term frequencies.
        vec = TfidfVectorizer()
        weights = vec.fit_transform([text_input])
        st.write("**TF-IDF Matrix:**", weights.toarray())
        st.write("**Feature Names:**", vec.get_feature_names_out())


# Footer
st.markdown("---")
st.markdown("""
<center>
    <p style='font-size:14px;'>Β© 2024 NLP Basics App. All Rights Reserved.</p>
</center>
""", unsafe_allow_html=True)