import streamlit as st
import nltk
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download necessary NLTK data
nltk.download("punkt")
nltk.download("punkt_tab")  # some newer NLTK releases also need this resource for tokenization
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("vader_lexicon")

# Streamlit app configuration
st.set_page_config(page_title="NLP Basics", page_icon="🤖", layout="wide")
st.title("🤖 NLP Basics for Beginners")

st.markdown("""
### Welcome to the NLP Basics App!

Explore core Natural Language Processing (NLP) concepts with interactive examples, including:

- **Tokenization** (Splitting text into words/sentences)
- **Stopword Removal** (Filtering common words)
- **Lemmatization & Stemming** (Word normalization)
- **Bag of Words (BoW) & TF-IDF** (Vectorization techniques)
- **Named Entity Recognition (NER)** (Extract key entities)
- **Sentiment Analysis** (Understand emotions in text)
- **Text Summarization** (Generate concise summaries)
- **Word Cloud Visualization** (Highlight common words)
""")

# User Input Text Box
st.subheader("Enter Text to Analyze")
text_input = st.text_area("Type or paste some text here...", height=150)

if not text_input.strip():
    st.warning("Please enter some text to explore NLP concepts.")

# NLP Processing Sections
tabs = st.tabs(["Tokenization", "Stopwords", "Lemmatization & Stemming", "Bag of Words (BoW)", "TF-IDF"])

# Tokenization
with tabs[0]:
    st.header("🔤 Tokenization")
    st.write("**Types of Tokenization:**")
    st.write("- **Sentence Tokenization**: Splitting text into sentences.")
    st.write("- **Word Tokenization**: Splitting text into individual words (tokens).")
    st.write("**Example Input**: \"I love NLP. It's amazing!\"")
    st.write("**Sentence Tokens**: [\"I love NLP.\", \"It's amazing!\"]")
    st.write("**Word Tokens**: [\"I\", \"love\", \"NLP\", \".\", \"It\", \"'s\", \"amazing\", \"!\"]")
    if text_input.strip():
        st.write("**Sentence Tokenization**:", sent_tokenize(text_input))
        st.write("**Word Tokenization**:", word_tokenize(text_input))

# Stopwords Removal
with tabs[1]:
    st.header("🛑 Stopwords Removal")
    st.write("**Example Input**: \"This is an example of stopwords removal.\"")
    st.write("**Output**: [\"example\", \"stopwords\", \"removal\"]")
    if text_input.strip():
        # Keep only tokens that are not in NLTK's English stopword list
        stop_words = set(stopwords.words("english"))
        words = word_tokenize(text_input)
        filtered_words = [word for word in words if word.lower() not in stop_words]
        st.write("**Filtered Words:**", filtered_words)

# Lemmatization & Stemming
with tabs[2]:
    st.header("🌱 Lemmatization & Stemming")
    st.write("**Example Input**: \"running studies\"")
    st.write("**Stemmed Output**: [\"run\", \"studi\"]")
    st.write("**Lemmatized Output**: [\"running\", \"study\"] (WordNetLemmatizer treats words as nouns unless a POS tag such as pos=\"v\" is passed)")
    if text_input.strip():
        words = word_tokenize(text_input)
        ps = PorterStemmer()
        stemmed_words = [ps.stem(word) for word in words]
        st.write("**Stemmed Words:**", stemmed_words)
        # Note: without POS tags the lemmatizer defaults to nouns, so "running" stays "running"
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        st.write("**Lemmatized Words:**", lemmatized_words)

# Bag of Words (BoW)
with tabs[3]:
    st.header("📦 Bag of Words (BoW)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    st.write("**BoW Representation**: { 'I':1, 'love':1, 'NLP':2, 'is':1, 'great':1 }")
    if text_input.strip():
        # Note: CountVectorizer lowercases text and skips 1-character tokens by default
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform([text_input])
        st.write("**BoW Matrix:**", X.toarray())
        st.write("**Feature Names:**", vectorizer.get_feature_names_out())

# TF-IDF
with tabs[4]:
    st.header("📊 TF-IDF (Term Frequency-Inverse Document Frequency)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    st.write("**TF-IDF Representation**: Higher weights for rare but important words.")
    if text_input.strip():
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform([text_input])
        st.write("**TF-IDF Matrix:**", tfidf_matrix.toarray())
        st.write("**Feature Names:**", tfidf_vectorizer.get_feature_names_out())

# Footer
st.markdown("---")
st.markdown("""
© 2024 NLP Basics App. All Rights Reserved.
""")
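# ---------------------------------------------------------------------------
# Note: the welcome text above also lists Named Entity Recognition and
# Sentiment Analysis, and the script already imports spacy and downloads the
# vader_lexicon, but neither feature is wired into a tab. The block below is a
# minimal, illustrative sketch of how those sections might look; it is an
# assumption rather than the original app's implementation, and the NER part
# expects the spaCy model "en_core_web_sm" to be installed
# (python -m spacy download en_core_web_sm).
# ---------------------------------------------------------------------------
from nltk.sentiment import SentimentIntensityAnalyzer

if text_input.strip():
    # Sentiment Analysis (sketch): VADER returns neg/neu/pos/compound scores
    st.subheader("🙂 Sentiment Analysis (sketch)")
    sia = SentimentIntensityAnalyzer()
    st.json(sia.polarity_scores(text_input))

    # Named Entity Recognition (sketch): extract entity text and labels with spaCy
    st.subheader("🏷️ Named Entity Recognition (sketch)")
    try:
        nlp = spacy.load("en_core_web_sm")
        entities = [(ent.text, ent.label_) for ent in nlp(text_input).ents]
        st.write(entities if entities else "No named entities found.")
    except OSError:
        st.warning("spaCy model 'en_core_web_sm' is not installed.")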