Create Techniques of NLP.py
pages/Techniques of NLP.py
ADDED
@@ -0,0 +1,157 @@
import streamlit as st
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import LatentDirichletAllocation, NMF
from wordcloud import WordCloud

# Download NLTK stopwords (a no-op after the first run)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Page title
st.title('Traditional NLP Techniques')

# Text Preprocessing
st.header('1. Text Preprocessing')

st.subheader('Definition:')
st.write("""
Text preprocessing is the process of cleaning and preparing raw text for further analysis or modeling.
This includes tasks such as removing unnecessary punctuation, converting text to lowercase,
and handling special characters like emojis.
""")

# Interactive example for preprocessing
text_input = st.text_area("Enter text to preprocess", "I love NLP! 😍 This is amazing.")

# Punctuation removal
if st.button('Remove Punctuation'):
    processed_text = ''.join([char for char in text_input if char not in string.punctuation])
    st.write(f"Text without punctuation: {processed_text}")

# Convert to lowercase
if st.button('Convert to Lowercase'):
    lowercase_text = text_input.lower()
    st.write(f"Text in lowercase: {lowercase_text}")

# Remove emojis (keeps only alphanumeric and whitespace characters,
# so punctuation is stripped as well)
if st.button('Remove Emojis'):
    processed_text_no_emoji = ''.join(char for char in text_input if char.isalnum() or char.isspace())
    st.write(f"Text without emojis: {processed_text_no_emoji}")

# Stopword removal
if st.button('Remove Stopwords'):
    words = text_input.split()
    filtered_text = ' '.join([word for word in words if word.lower() not in stop_words])
    st.write(f"Text without stopwords: {filtered_text}")
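
# The buttons above apply each cleanup step in isolation; in practice the
# steps are chained. A minimal illustrative helper (an addition for reference,
# not wired into the interactive demo above):
def preprocess(text):
    """Lowercase, strip punctuation, then drop stopwords."""
    text = text.lower()
    text = ''.join(char for char in text if char not in string.punctuation)
    return ' '.join(word for word in text.split() if word not in stop_words)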

# Text Vectorization
st.header('2. Text Vectorization')

st.subheader('Definition:')
st.write("""
Text vectorization converts text into numerical form so that machine learning models can process it.
Two common techniques are Bag of Words (BoW) and Term Frequency-Inverse Document Frequency (TF-IDF).
""")

# Interactive example for vectorization
vectorization_choice = st.selectbox('Choose vectorization technique:', ('Bag of Words', 'TF-IDF'))

# Text for vectorization
sample_text = ["I love programming.", "NLP is fun.", "Streamlit makes things easy!"]

if st.button('Apply Vectorization'):
    if vectorization_choice == 'Bag of Words':
        vectorizer = CountVectorizer()
    else:
        vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sample_text)
    st.write(f"Vectorized Representation:\n{X.toarray()}")
    st.write(f"Feature names: {vectorizer.get_feature_names_out()}")
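
# For reference: with its default settings, scikit-learn's TfidfVectorizer
# weights each term t in document d as
#     tf-idf(t, d) = tf(t, d) * (ln((1 + n) / (1 + df(t))) + 1)
# where n is the number of documents and df(t) is the number of documents
# containing t; each document row is then L2-normalized. This is why the
# TF-IDF output above differs from the raw counts produced by CountVectorizer.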

# Basic Machine Learning
st.header('3. Basic Machine Learning')

st.subheader('Definition:')
st.write("""
Basic machine learning techniques, such as Naive Bayes, Logistic Regression, and Support Vector Machines (SVM),
are commonly used for text classification tasks.
""")

# Load dataset (note: this load and the TF-IDF fit below run at module level,
# so Streamlit repeats them on every interaction; wrapping them in a
# @st.cache_data function would avoid the rework)
newsgroups = fetch_20newsgroups(subset='train')
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data, newsgroups.target, test_size=0.3, random_state=42)

model_choice = st.selectbox('Choose machine learning model for text classification:',
                            ('Naive Bayes', 'Logistic Regression', 'SVM', 'Random Forest'))

# Vectorization for classification
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train model based on choice
if st.button('Train Model'):
    if model_choice == 'Naive Bayes':
        model = MultinomialNB()
    elif model_choice == 'Logistic Regression':
        model = LogisticRegression(max_iter=1000)
    elif model_choice == 'SVM':
        model = SVC()
    else:
        model = RandomForestClassifier()

    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    accuracy = accuracy_score(y_test, y_pred)
    st.write(f"Model Accuracy: {accuracy * 100:.2f}%")
    st.text("Classification Report:")
    st.text(classification_report(y_test, y_pred))
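
# Hypothetical helper (not wired into the UI above) showing how a trained
# model would score new text: the input must pass through the same fitted
# vectorizer so its feature columns match those seen during training.
def classify_text(trained_model, text):
    vec = vectorizer.transform([text])  # reuse the learned TF-IDF vocabulary
    label_id = trained_model.predict(vec)[0]
    return newsgroups.target_names[label_id]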

# Topic Modeling
st.header('4. Topic Modeling')

st.subheader('Definition:')
st.write("""
Topic modeling is a technique used to identify the underlying topics in a collection of text data.
Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF) are two common techniques for this task.
""")

topic_model_choice = st.selectbox('Choose topic modeling technique:', ('LDA', 'NMF'))

# Apply LDA or NMF for topic modeling
if st.button('Run Topic Modeling'):
    # Drop English stopwords so topics are not dominated by function words
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(newsgroups.data)

    if topic_model_choice == 'LDA':
        # LDA is usually fit on raw term counts; the TF-IDF matrix is reused
        # here so both models share one vectorizer
        model = LatentDirichletAllocation(n_components=5, random_state=42)
    else:
        model = NMF(n_components=5, random_state=42)

    model.fit(X)
    feature_names = vectorizer.get_feature_names_out()

    # Display top words for each topic
    for topic_idx, topic in enumerate(model.components_):
        st.write(f"Topic {topic_idx + 1}:")
        top_words_idx = topic.argsort()[:-10 - 1:-1]  # indices of the 10 highest-weighted terms
        top_words = [feature_names[i] for i in top_words_idx]
        st.write(", ".join(top_words))

        # Generate a word cloud from this topic's top words
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(top_words))
        st.image(wordcloud.to_array(), caption=f"Word Cloud for Topic {topic_idx + 1}")
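
# For reference: each row of model.components_ holds one topic's weights over
# the vocabulary. For LDA these are unnormalized pseudo-counts; per the
# scikit-learn docs, dividing each row by its sum yields the topic's word
# distribution, e.g.:
#
#     topic_word = model.components_ / model.components_.sum(axis=1, keepdims=True)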