File size: 4,510 Bytes
ff31579
 
8d56add
ff31579
 
 
 
 
 
 
 
 
8d56add
d092a4f
8d56add
ff31579
 
 
 
 
 
 
 
 
 
 
8d56add
 
 
 
ff31579
 
 
 
 
 
 
 
 
 
79116c1
ff31579
 
 
 
8d56add
 
 
 
 
 
ff31579
8d56add
 
ff31579
 
 
 
8d56add
 
ff31579
 
 
 
 
 
 
 
 
8d56add
 
 
ff31579
 
 
 
 
 
 
 
 
 
 
 
8d56add
 
ff31579
 
 
8d56add
ff31579
 
 
 
 
8d56add
 
ff31579
 
 
8d56add
ff31579
2f0d6e1
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import streamlit as st
import nltk
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download the NLTK resources the app needs. `quiet=True` keeps the download
# progress output from flooding the console on every Streamlit rerun.
# NOTE(review): newer NLTK releases (>= 3.8.2) require "punkt_tab" in addition
# to "punkt" for sent_tokenize/word_tokenize -- download both so tokenization
# works across NLTK versions (a download of an already-present or unknown
# resource is a harmless no-op).
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
# vader_lexicon has no sentiment-analysis code in this file yet -- presumably
# downloaded for a planned sentiment tab; kept for backward compatibility.
nltk.download("vader_lexicon", quiet=True)


# Streamlit app configuration
st.set_page_config(page_title="NLP Basics", page_icon="🤖", layout="wide")
st.title("🤖 NLP Basics for Beginners")

# Intro copy. Only the features that actually have a tab below are listed --
# the previous text also advertised NER, sentiment analysis, summarization and
# word clouds, none of which are implemented in this file.
st.markdown("""
### Welcome to the NLP Basics App!
Explore core Natural Language Processing (NLP) concepts with interactive examples, including:
- **Tokenization** (Splitting text into words/sentences)
- **Stopword Removal** (Filtering common words)
- **Lemmatization & Stemming** (Word normalization)
- **Bag of Words (BoW) & TF-IDF** (Vectorization techniques)
""")

# User Input Text Box
st.subheader("Enter Text to Analyze")
text_input = st.text_area("Type or paste some text here...", height=150)

# Nudge the user until there is something non-blank to analyze; each tab
# below independently re-checks text_input before running its demo.
if not text_input.strip():
    st.warning("Please enter some text to explore NLP concepts.")

# NLP Processing Sections -- one tab per concept; tab indices match the
# `with tabs[i]:` blocks that follow.
tabs = st.tabs(["Tokenization", "Stopwords", "Lemmatization & Stemming", "Bag of Words (BoW)", "TF-IDF"])

# Tokenization
with tabs[0]:
    st.header("🔤 Tokenization")
    # Static explainer shown regardless of user input.
    explainer = [
        "**Types of Tokenization:**",
        "- **Sentence Tokenization**: Splitting text into sentences.",
        "- **Word Tokenization**: Splitting text into individual words (tokens).",
        "**Example Input**: \"I love NLP. It's amazing!\"",
        "**Sentence Tokens**: [\"I love NLP.\", \"It's amazing!\"]",
        "**Word Tokens**: [\"I\", \"love\", \"NLP\", \".\", \"It\", \"'s\", \"amazing\", \"!\"]",
    ]
    for line in explainer:
        st.write(line)
    # Live demo only once the user has typed non-blank text.
    if text_input.strip():
        st.write("**Sentence Tokenization**:", sent_tokenize(text_input))
        st.write("**Word Tokenization**:", word_tokenize(text_input))

# Stopwords Removal
with tabs[1]:
    st.header("🛑 Stopwords Removal")
    st.write("**Example Input**: \"This is an example of stopwords removal.\"")
    st.write("**Output**: [\"example\", \"stopwords\", \"removal\"]")
    if text_input.strip():
        stop_words = set(stopwords.words("english"))
        words = word_tokenize(text_input)
        # Drop stopwords AND bare punctuation tokens. Punctuation (".", "!",
        # ...) is not in NLTK's stopword list, so the original code leaked it
        # into the output, contradicting the documented example above.
        filtered_words = [
            word for word in words
            if word.lower() not in stop_words and any(ch.isalnum() for ch in word)
        ]
        st.write("**Filtered Words:**", filtered_words)

# Lemmatization & Stemming
with tabs[2]:
    st.header("🌱 Lemmatization & Stemming")
    st.write("**Example Input**: \"running studies\"")
    st.write("**Stemmed Output**: [\"run\", \"studi\"]")
    st.write("**Lemmatized Output**: [\"run\", \"study\"]")
    if text_input.strip():
        tokens = word_tokenize(text_input)
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        # Stemming chops suffixes heuristically; lemmatization maps to
        # dictionary forms -- shown side by side for comparison.
        st.write("**Stemmed Words:**", [stemmer.stem(tok) for tok in tokens])
        st.write("**Lemmatized Words:**", [lemmatizer.lemmatize(tok) for tok in tokens])

# Bag of Words (BoW)
with tabs[3]:
    st.header("📦 Bag of Words (BoW)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    st.write("**BoW Representation**: { 'I':1, 'love':1, 'NLP':2, 'is':1, 'great':1 }")
    if text_input.strip():
        # Fit on the single user document; the matrix row holds raw counts
        # and the feature names give the column order.
        bow_vectorizer = CountVectorizer()
        count_matrix = bow_vectorizer.fit_transform([text_input])
        st.write("**BoW Matrix:**", count_matrix.toarray())
        st.write("**Feature Names:**", bow_vectorizer.get_feature_names_out())

# TF-IDF
with tabs[4]:
    st.header("📊 TF-IDF (Term Frequency-Inverse Document Frequency)")
    st.write("**Example Input**: \"I love NLP. NLP is great!\"")
    st.write("**TF-IDF Representation**: Higher weights for rare but important words.")
    if text_input.strip():
        # With a single-document corpus every IDF is identical, so this demo
        # effectively shows normalized term frequencies.
        vec = TfidfVectorizer()
        weights = vec.fit_transform([text_input])
        st.write("**TF-IDF Matrix:**", weights.toarray())
        st.write("**Feature Names:**", vec.get_feature_names_out())


# Footer
st.markdown("---")
st.markdown("""
<center>
    <p style='font-size:14px;'>Β© 2024 NLP Basics App. All Rights Reserved.</p>
</center>
""", unsafe_allow_html=True)