Create Techniques of NLP.py
pages/Techniques of NLP.py
ADDED
@@ -0,0 +1,157 @@
import streamlit as st
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import LatentDirichletAllocation, NMF
from wordcloud import WordCloud

# Download NLTK stopwords (a no-op after the first run)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Page title
st.title('Traditional NLP Techniques')

# Text Preprocessing
st.header('1. Text Preprocessing')

st.subheader('Definition:')
st.write("""
Text preprocessing is the process of cleaning and preparing raw text for further analysis or modeling.
This includes tasks such as removing unnecessary punctuation, converting text to lowercase,
and handling special characters like emojis.
""")

# Interactive example for preprocessing
text_input = st.text_area("Enter text to preprocess", "I love NLP! 😍 This is amazing.")

# Punctuation removal
if st.button('Remove Punctuation'):
    processed_text = ''.join([char for char in text_input if char not in string.punctuation])
    st.write(f"Text without punctuation: {processed_text}")

# Convert to lowercase
if st.button('Convert to Lowercase'):
    lowercase_text = text_input.lower()
    st.write(f"Text in lowercase: {lowercase_text}")

# Remove emojis (keeps only alphanumeric and whitespace characters,
# so punctuation is stripped as well)
if st.button('Remove Emojis'):
    processed_text_no_emoji = ''.join(char for char in text_input if char.isalnum() or char.isspace())
    st.write(f"Text without emojis: {processed_text_no_emoji}")

# Stopword removal
if st.button('Remove Stopwords'):
    words = text_input.split()
    filtered_text = ' '.join([word for word in words if word.lower() not in stop_words])
    st.write(f"Text without stopwords: {filtered_text}")
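
# The buttons above apply each cleanup step in isolation; in practice the
# steps are chained. A minimal illustrative helper (an addition for reference,
# not wired into the interactive demo above):
def preprocess(text):
    """Lowercase, strip punctuation, then drop stopwords."""
    text = text.lower()
    text = ''.join(char for char in text if char not in string.punctuation)
    return ' '.join(word for word in text.split() if word not in stop_words)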

# Text Vectorization
st.header('2. Text Vectorization')

st.subheader('Definition:')
st.write("""
Text vectorization converts text into numerical form so that machine learning models can process it.
Two common techniques are Bag of Words (BoW) and Term Frequency-Inverse Document Frequency (TF-IDF).
""")

# Interactive example for vectorization
vectorization_choice = st.selectbox('Choose vectorization technique:', ('Bag of Words', 'TF-IDF'))

# Text for vectorization
sample_text = ["I love programming.", "NLP is fun.", "Streamlit makes things easy!"]

if st.button('Apply Vectorization'):
    if vectorization_choice == 'Bag of Words':
        vectorizer = CountVectorizer()
    else:
        vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sample_text)
    st.write(f"Vectorized Representation:\n{X.toarray()}")
    st.write(f"Feature names: {vectorizer.get_feature_names_out()}")
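
# For reference: with its default settings, scikit-learn's TfidfVectorizer
# weights each term t in document d as
#     tf-idf(t, d) = tf(t, d) * (ln((1 + n) / (1 + df(t))) + 1)
# where n is the number of documents and df(t) is the number of documents
# containing t; each document row is then L2-normalized. This is why the
# TF-IDF output above differs from the raw counts produced by CountVectorizer.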

# Basic Machine Learning
st.header('3. Basic Machine Learning')

st.subheader('Definition:')
st.write("""
Basic machine learning techniques, such as Naive Bayes, Logistic Regression, and Support Vector Machines (SVM),
are commonly used for text classification tasks.
""")

# Load dataset (note: this load and the TF-IDF fit below run at module level,
# so Streamlit repeats them on every interaction; wrapping them in a
# @st.cache_data function would avoid the rework)
newsgroups = fetch_20newsgroups(subset='train')
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data, newsgroups.target, test_size=0.3, random_state=42)

model_choice = st.selectbox('Choose machine learning model for text classification:',
                            ('Naive Bayes', 'Logistic Regression', 'SVM', 'Random Forest'))

# Vectorization for classification
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train model based on choice
if st.button('Train Model'):
    if model_choice == 'Naive Bayes':
        model = MultinomialNB()
    elif model_choice == 'Logistic Regression':
        model = LogisticRegression(max_iter=1000)
    elif model_choice == 'SVM':
        model = SVC()
    else:
        model = RandomForestClassifier()

    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    accuracy = accuracy_score(y_test, y_pred)
    st.write(f"Model Accuracy: {accuracy * 100:.2f}%")
    st.text("Classification Report:")
    st.text(classification_report(y_test, y_pred))
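
# Hypothetical helper (not wired into the UI above) showing how a trained
# model would score new text: the input must pass through the same fitted
# vectorizer so its feature columns match those seen during training.
def classify_text(trained_model, text):
    vec = vectorizer.transform([text])  # reuse the learned TF-IDF vocabulary
    label_id = trained_model.predict(vec)[0]
    return newsgroups.target_names[label_id]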

# Topic Modeling
st.header('4. Topic Modeling')

st.subheader('Definition:')
st.write("""
Topic modeling is a technique used to identify the underlying topics in a collection of text data.
Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF) are two common techniques for this task.
""")

topic_model_choice = st.selectbox('Choose topic modeling technique:', ('LDA', 'NMF'))

# Apply LDA or NMF for topic modeling
if st.button('Run Topic Modeling'):
    # Drop English stopwords so topics are not dominated by function words
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(newsgroups.data)

    if topic_model_choice == 'LDA':
        # LDA is usually fit on raw term counts; the TF-IDF matrix is reused
        # here so both models share one vectorizer
        model = LatentDirichletAllocation(n_components=5, random_state=42)
    else:
        model = NMF(n_components=5, random_state=42)

    model.fit(X)
    feature_names = vectorizer.get_feature_names_out()

    # Display top words for each topic
    for topic_idx, topic in enumerate(model.components_):
        st.write(f"Topic {topic_idx + 1}:")
        top_words_idx = topic.argsort()[:-10 - 1:-1]  # indices of the 10 highest-weighted terms
        top_words = [feature_names[i] for i in top_words_idx]
        st.write(", ".join(top_words))

        # Generate a word cloud from this topic's top words
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(top_words))
        st.image(wordcloud.to_array(), caption=f"Word Cloud for Topic {topic_idx + 1}")
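
# For reference: each row of model.components_ holds one topic's weights over
# the vocabulary. For LDA these are unnormalized pseudo-counts; per the
# scikit-learn docs, dividing each row by its sum yields the topic's word
# distribution, e.g.:
#
#     topic_word = model.components_ / model.components_.sum(axis=1, keepdims=True)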