File size: 6,166 Bytes
7dba2e2 be4214a 7dba2e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import streamlit as st
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import LatentDirichletAllocation, NMF
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Build the English stopword set used by the "Remove Stopwords" demo.
# This script is executed again on every widget interaction, so the corpus is
# loaded first and the NLTK downloader is only contacted when the corpus is
# genuinely missing (signalled by LookupError), not on every run.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # First run on this machine: fetch the corpus quietly, then retry the load.
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
# Page title: a centered, colored <h1> rendered as raw HTML.
# NOTE(review): the emoji in this heading look mis-encoded — confirm the
# file is saved and served as UTF-8.
page_title_html = (
    "\n"
    "<h1 style='text-align: center; color: #FF5733;'>π Techniques of NLP π</h1>"
    "\n"
)
st.markdown(page_title_html, unsafe_allow_html=True)
# Text Preprocessing
st.markdown("""
<h2 style='color: #2E86C1;'>πΉ 1. Text Preprocessing</h2>
""", unsafe_allow_html=True)
st.subheader('π Definition:')
st.write("""
Text preprocessing is the process of cleaning and preparing raw text for further analysis or modeling.
This includes tasks such as removing unnecessary punctuation, converting text to lowercase,
and handling special characters like emojis.
""")

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Inclusive code-point ranges treated as "emoji" by the demo. This covers the
# common emoji blocks plus the invisible joiners/selectors that glue emoji
# sequences together; it is intentionally simple and not an exhaustive list.
_EMOJI_RANGES = (
    (0x1F000, 0x1FAFF),  # pictographs, emoticons, transport, flags, skin tones
    (0x2600, 0x27BF),    # miscellaneous symbols and dingbats
    (0x2300, 0x23FF),    # miscellaneous technical (watch, hourglass, ...)
    (0x2B00, 0x2BFF),    # miscellaneous symbols and arrows (star, ...)
    (0xFE00, 0xFE0F),    # variation selectors
    (0x200D, 0x200D),    # zero-width joiner
    (0x20E3, 0x20E3),    # combining enclosing keycap
)


def _remove_punctuation(text):
    """Return *text* with all ASCII punctuation characters removed."""
    return text.translate(_PUNCTUATION_TABLE)


def _is_emoji(char):
    """Return True if *char* falls inside one of the ``_EMOJI_RANGES``."""
    code_point = ord(char)
    return any(low <= code_point <= high for low, high in _EMOJI_RANGES)


def _remove_emojis(text):
    """Return *text* without emoji, leaving letters, digits and punctuation.

    The previous implementation kept only alphanumerics and whitespace, which
    also stripped every punctuation mark and symbol from the text.
    """
    return ''.join(char for char in text if not _is_emoji(char))


def _remove_stopwords(text):
    """Return *text* with English stopwords dropped (whitespace-tokenized).

    Punctuation attached to a token's edges is ignored for the comparison so
    that tokens such as "the," are recognized as stopwords as well.
    """
    kept = [
        word for word in text.split()
        if word.strip(string.punctuation).lower() not in stop_words
    ]
    return ' '.join(kept)


# Interactive example for preprocessing: one button per transformation, each
# applied to the raw contents of the text area.
text_input = st.text_area("βοΈ Enter text to preprocess", height=150, placeholder="Type or paste some text here...")
col1, col2, col3, col4 = st.columns(4)
with col1:
    if st.button('βοΈ Remove Punctuation'):
        processed_text = _remove_punctuation(text_input)
        st.success(f"Text without punctuation: {processed_text}")
with col2:
    if st.button('π‘ Convert to Lowercase'):
        lowercase_text = text_input.lower()
        st.success(f"Text in lowercase: {lowercase_text}")
with col3:
    if st.button('π Remove Emojis'):
        processed_text_no_emoji = _remove_emojis(text_input)
        st.success(f"Text without emojis: {processed_text_no_emoji}")
with col4:
    if st.button('π« Remove Stopwords'):
        filtered_text = _remove_stopwords(text_input)
        st.success(f"Text without stopwords: {filtered_text}")
# Text Vectorization
vectorization_heading = """
<h2 style='color: #2E86C1;'>π 2. Text Vectorization</h2>
"""
st.markdown(vectorization_heading, unsafe_allow_html=True)
st.subheader('π Definition:')
st.write("""
Text vectorization converts text into numerical form so that machine learning models can process it.
Two common techniques are Bag of Words (BoW) and Term Frequency-Inverse Document Frequency (TF-IDF).
""")
# Let the user pick which vectorizer the demo should run.
vectorization_choice = st.selectbox(
    'π Choose vectorization technique:',
    ('Bag of Words', 'TF-IDF'),
)
# Small fixed corpus the chosen vectorizer is fitted on.
sample_text = [
    "Artificial intelligence is transforming the world.",
    "Natural Language Processing is a subset of AI.",
    "Machine learning algorithms improve over time!",
]
if st.button('π Apply Vectorization'):
    # 'Bag of Words' maps to raw counts; any other choice maps to TF-IDF.
    if vectorization_choice == 'Bag of Words':
        demo_vectorizer = CountVectorizer()
    else:
        demo_vectorizer = TfidfVectorizer()
    doc_term_matrix = demo_vectorizer.fit_transform(sample_text)
    dense_matrix = doc_term_matrix.toarray()
    vocabulary = demo_vectorizer.get_feature_names_out()
    st.write(f"**Vectorized Representation:**\n{dense_matrix}")
    st.write(f"**Feature names:** {vocabulary}")
# Basic Machine Learning
st.markdown("""
<h2 style='color: #2E86C1;'>π€ 3. Basic Machine Learning</h2>
""", unsafe_allow_html=True)
st.subheader('π Definition:')
st.write("""
Basic machine learning techniques, such as Naive Bayes, Logistic Regression, and Support Vector Machines (SVM),
are commonly used for text classification tasks.
""")


@st.cache_resource
def _load_newsgroups():
    """Return the 20 Newsgroups training subset, loaded once and then reused.

    The script body runs again on every widget interaction; caching keeps the
    dataset from being re-read each time. The returned object is shared
    between reruns and must be treated as read-only.
    """
    return fetch_20newsgroups(subset='train')


@st.cache_resource
def _prepare_classification_data():
    """Split the corpus 70/30 and TF-IDF-vectorize both parts.

    Returns:
        A tuple ``(X_train_vec, X_test_vec, y_train, y_test)``. The vectorizer
        is fitted on the training documents only and then applied to the test
        documents. The split uses a fixed ``random_state`` so the reported
        metrics refer to the same hold-out set on every run.
    """
    corpus = _load_newsgroups()
    X_train, X_test, y_train, y_test = train_test_split(
        corpus.data, corpus.target, test_size=0.3, random_state=42
    )
    vectorizer = TfidfVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test


# Load dataset. Kept as a module-level name because the topic-modeling
# section further down reads `newsgroups.data`.
newsgroups = _load_newsgroups()

# Display name -> zero-argument factory, so only the selected estimator is
# ever instantiated.
MODEL_FACTORIES = {
    'Naive Bayes': MultinomialNB,
    'Logistic Regression': lambda: LogisticRegression(max_iter=1000),
    'SVM': SVC,
    'Random Forest': RandomForestClassifier,
}
model_choice = st.selectbox('π€ Choose machine learning model for text classification:',
                            tuple(MODEL_FACTORIES))
if st.button('π― Train Model'):
    # Splitting and vectorizing are deferred until training is requested, so
    # unrelated interactions elsewhere on the page do not pay for them.
    X_train_vec, X_test_vec, y_train, y_test = _prepare_classification_data()
    model = MODEL_FACTORIES[model_choice]()
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    st.success(f"π Model Accuracy: {accuracy * 100:.2f}%")
    st.text("π Classification Report:")
    st.text(classification_report(y_test, y_pred))
# Topic Modeling
st.markdown("""
<h2 style='color: #2E86C1;'>π 4. Topic Modeling</h2>
""", unsafe_allow_html=True)
st.subheader('π Definition:')
st.write("""
Topic modeling is a technique used to identify the underlying topics in a collection of text data.
Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF) are two common techniques for this task.
""")
topic_model_choice = st.selectbox('π Choose topic modeling technique:', ('LDA', 'NMF'))

N_TOPICS = 5       # number of topics extracted by either model
N_TOP_WORDS = 10   # number of highest-weighted words shown per topic

if st.button('π Run Topic Modeling'):
    # LDA is fitted on raw term counts, NMF on TF-IDF weights. English stop
    # words are dropped in both cases so that topics are not dominated by
    # function words.
    if topic_model_choice == 'LDA':
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        model = LatentDirichletAllocation(n_components=N_TOPICS, random_state=42)
    else:
        vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
        model = NMF(n_components=N_TOPICS, random_state=42)
    X = vectorizer.fit_transform(newsgroups.data)
    model.fit(X)
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(model.components_):
        st.write(f"π **Topic {topic_idx + 1}:**")
        # Indices of the N_TOP_WORDS largest weights, highest first.
        top_words_idx = topic.argsort()[::-1][:N_TOP_WORDS]
        top_words = [feature_names[i] for i in top_words_idx]
        st.success(", ".join(top_words))
        # Size each word by its topic weight instead of re-tokenizing the
        # joined words, which would apply WordCloud's own stopword filter and
        # give every word the same size. Zero weights carry no information
        # and are left out.
        word_weights = {
            feature_names[i]: float(topic[i])
            for i in top_words_idx
            if topic[i] > 0
        }
        if word_weights:
            wordcloud = WordCloud(
                width=800, height=400, background_color='white'
            ).generate_from_frequencies(word_weights)
            st.image(wordcloud.to_array(), caption=f"π₯ Word Cloud for Topic {topic_idx + 1}")
|