Gopi9177 committed on
Commit
21ef19f
·
verified ·
1 Parent(s): 79116c1

Create Techniques of NLP.py

Browse files
Files changed (1) hide show
  1. pages/Techniques of NLP.py +157 -0
pages/Techniques of NLP.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import string
3
+ import numpy as np
4
+ import pandas as pd
5
+ import nltk
6
+ from nltk.corpus import stopwords
7
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
8
+ from sklearn.naive_bayes import MultinomialNB
9
+ from sklearn.linear_model import LogisticRegression
10
+ from sklearn.svm import SVC
11
+ from sklearn.ensemble import RandomForestClassifier
12
+ from sklearn.datasets import fetch_20newsgroups
13
+ from sklearn.model_selection import train_test_split
14
+ from sklearn.metrics import accuracy_score, classification_report
15
+ from sklearn.decomposition import LatentDirichletAllocation, NMF
16
+ from wordcloud import WordCloud
17
+ import matplotlib.pyplot as plt
18
+
@st.cache_resource
def _load_stop_words():
    """Return the NLTK English stopword list as a set.

    Streamlit re-executes this script on every widget interaction, so the
    result is cached to avoid repeating the download check and the corpus
    read each time. The corpus is downloaded only when it is not installed.
    The returned set is shared between reruns; treat it as read-only.
    """
    try:
        words = stopwords.words('english')
    except LookupError:
        # Corpus not available locally yet: fetch it once, then read it.
        nltk.download('stopwords')
        words = stopwords.words('english')
    return set(words)


# English stopwords used by the "Remove Stopwords" example below.
stop_words = _load_stop_words()

# Page title
st.title('Traditional NLP Techniques')

# Text Preprocessing
import re  # needed only by the emoji filter in this section

st.header('1. Text Preprocessing')

st.subheader('Definition:')
st.write("""
Text preprocessing is the process of cleaning and preparing raw text for further analysis or modeling.
This includes tasks such as removing unnecessary punctuation, converting text to lowercase,
and handling special characters like emojis.
""")

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Matches runs of emoji code points. Punctuation, letters and digits are
# deliberately left alone so only the emojis disappear from the text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F1E6-\U0001F1FF"  # regional indicator letters (flags)
    "\U0001F300-\U0001FAFF"  # pictographs, emoticons, transport, extended symbols
    "\u2600-\u27BF"          # miscellaneous symbols and dingbats
    "\u200D\uFE0F"           # zero-width joiner and emoji variation selector
    "]+"
)

# Interactive example for preprocessing
text_input = st.text_area("Enter text to preprocess", "I love NLP! 😍 This is amazing.")

# Punctuation removal
if st.button('Remove Punctuation'):
    processed_text = text_input.translate(_PUNCTUATION_TABLE)
    st.write(f"Text without punctuation: {processed_text}")

# Convert to lowercase
if st.button('Convert to Lowercase'):
    lowercase_text = text_input.lower()
    st.write(f"Text in lowercase: {lowercase_text}")

# Remove emojis (the rest of the text, including punctuation, is kept)
if st.button('Remove Emojis'):
    processed_text_no_emoji = _EMOJI_PATTERN.sub('', text_input)
    st.write(f"Text without emojis: {processed_text_no_emoji}")

# Stopword removal
if st.button('Remove Stopwords'):
    kept_words = [
        word
        for word in text_input.split()
        # Compare without surrounding punctuation so tokens such as "is."
        # or "(the" are still recognised as stopwords.
        if word.lower().strip(string.punctuation) not in stop_words
    ]
    filtered_text = ' '.join(kept_words)
    st.write(f"Text without stopwords: {filtered_text}")

# Text Vectorization
st.header('2. Text Vectorization')

st.subheader('Definition:')
st.write("""
Text vectorization converts text into numerical form so that machine learning models can process it.
Two common techniques are Bag of Words (BoW) and Term Frequency-Inverse Document Frequency (TF-IDF).
""")

# Interactive example for vectorization
vectorization_choice = st.selectbox('Choose vectorization technique:', ('Bag of Words', 'TF-IDF'))

# Text for vectorization
sample_text = ["I love programming.", "NLP is fun.", "Streamlit makes things easy!"]

if st.button('Apply Vectorization'):
    # Any choice other than 'Bag of Words' falls back to TF-IDF.
    if vectorization_choice == 'Bag of Words':
        vectorizer = CountVectorizer()
    else:
        vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sample_text)

    # Show the matrix as a table (one row per sentence, one column per
    # term). Interpolating the array into a Markdown string collapses its
    # line breaks and makes the rows unreadable.
    st.write("Vectorized Representation:")
    st.dataframe(
        pd.DataFrame(
            X.toarray(),
            index=sample_text,
            columns=vectorizer.get_feature_names_out(),
        )
    )
    st.write(f"Feature names: {vectorizer.get_feature_names_out()}")

# Basic Machine Learning
st.header('3. Basic Machine Learning')

st.subheader('Definition:')
st.write("""
Basic machine learning techniques, such as Naive Bayes, Logistic Regression, and Support Vector Machines (SVM),
are commonly used for text classification tasks.
""")


@st.cache_resource
def _load_newsgroups():
    """Load the 20 Newsgroups training subset once per server process.

    Streamlit re-executes the script on every widget interaction; caching
    keeps the dataset from being reloaded each time. The returned object is
    shared between reruns and must be treated as read-only.
    """
    return fetch_20newsgroups(subset='train')


@st.cache_resource
def _vectorized_split():
    """Return ``(X_train_vec, X_test_vec, y_train, y_test)`` for classification.

    The 70/30 split uses a fixed ``random_state`` so reported accuracies are
    reproducible across reruns, and the TF-IDF vocabulary is fitted on the
    training portion only. The result is cached because fitting the
    vectorizer on the full corpus is expensive.
    """
    dataset = _load_newsgroups()
    docs_train, docs_test, labels_train, labels_test = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(docs_train)
    test_matrix = tfidf.transform(docs_test)
    return train_matrix, test_matrix, labels_train, labels_test


# Load dataset (also used by the topic modeling section below)
newsgroups = _load_newsgroups()

model_choice = st.selectbox('Choose machine learning model for text classification:',
                            ('Naive Bayes', 'Logistic Regression', 'SVM', 'Random Forest'))

# Maps each selectbox label to a factory for a fresh, untrained estimator.
_MODEL_FACTORIES = {
    'Naive Bayes': MultinomialNB,
    'Logistic Regression': lambda: LogisticRegression(max_iter=1000),
    'SVM': SVC,
    'Random Forest': RandomForestClassifier,
}

# Train model based on choice
if st.button('Train Model'):
    # Split and vectorize only when training is requested, so merely
    # opening the page or using another section does not pay for it.
    X_train_vec, X_test_vec, y_train, y_test = _vectorized_split()

    model = _MODEL_FACTORIES[model_choice]()
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    accuracy = accuracy_score(y_test, y_pred)
    st.write(f"Model Accuracy: {accuracy * 100:.2f}%")
    st.text("Classification Report:")
    st.text(classification_report(y_test, y_pred))

# Topic Modeling
st.header('4. Topic Modeling')

st.subheader('Definition:')
st.write("""
Topic modeling is a technique used to identify the underlying topics in a collection of text data.
Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF) are two common techniques for this task.
""")

topic_model_choice = st.selectbox('Choose topic modeling technique:', ('LDA', 'NMF'))

# Number of highest-weighted words shown (and drawn) for each topic.
_TOP_WORDS_PER_TOPIC = 10

# Apply LDA or NMF for topic modeling
if st.button('Run Topic Modeling'):
    # English stopwords are filtered out in both cases; otherwise the top
    # words of every topic are dominated by function words.
    if topic_model_choice == 'LDA':
        # LDA is a model over word counts, so it gets raw term frequencies.
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        model = LatentDirichletAllocation(n_components=5, random_state=42)
    else:
        # NMF factorizes the TF-IDF weighted matrix.
        vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
        model = NMF(n_components=5, random_state=42)

    X = vectorizer.fit_transform(newsgroups.data)
    model.fit(X)
    feature_names = vectorizer.get_feature_names_out()

    # Display top words for each topic
    for topic_number, topic in enumerate(model.components_, start=1):
        # Indices of the highest-weighted terms, largest first.
        top_words_idx = topic.argsort()[::-1][:_TOP_WORDS_PER_TOPIC]
        top_words = [feature_names[i] for i in top_words_idx]
        st.write(f"Topic {topic_number}:")
        st.write(", ".join(top_words))

        # Generate word cloud for the topic, sizing each word by its topic
        # weight. Zero weights are dropped because the word cloud scales
        # frequencies relative to the largest one.
        word_weights = {
            feature_names[i]: float(topic[i])
            for i in top_words_idx
            if topic[i] > 0
        }
        if word_weights:
            wordcloud = WordCloud(
                width=800, height=400, background_color='white'
            ).generate_from_frequencies(word_weights)
            st.image(wordcloud.to_array(), caption=f"Word Cloud for Topic {topic_number}")