import streamlit as st
import pandas as pd
import numpy as np
import re
import string
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import silhouette_score
# Download stopwords if not already available (no-op when cached locally)
nltk.download("stopwords", quiet=True)
# Load models and vectorizer
kmeans = joblib.load("kmeans_fake_news.pkl")
lda = joblib.load("lda_fake_news.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")
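# NOTE: these .pkl artifacts are assumed to come from a separate offline
# training run fitted on the same corpus; they are not produced by this app.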
# Load dataset
DATASET_URL = "https://www.kaggle.com/datasets/mrisdal/fake-news"
fake_df = pd.read_csv("Fake.csv")
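# Fake.csv must sit in the app's working directory; it is assumed to hold the
# fake-article portion of the Kaggle dataset linked above.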
# Preprocessing
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Lowercase the text and strip punctuation, numbers, and stopwords."""
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation (escaped so the character class stays valid)
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = " ".join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text
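# Example: clean_text("Breaking News: 5 Things You Missed!") -> "breaking news things missed"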
fake_df = fake_df[['title', 'text']].dropna()
fake_df['content'] = fake_df['title'] + " " + fake_df['text']
fake_df['clean_text'] = fake_df['content'].apply(clean_text)
# Transform text into TF-IDF features
X = vectorizer.transform(fake_df['clean_text'])
fake_df['cluster'] = kmeans.predict(X)
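# The vectorizer must be the same one fitted at training time so the feature
# space matches what kmeans and lda expect.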
# Get the top 10 words for each LDA topic, highest-weight first
words = np.array(vectorizer.get_feature_names_out())
top_words = [" ".join(words[np.argsort(topic)][-10:][::-1]) for topic in lda.components_]
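# lda.components_ has shape (n_topics, n_features); argsort ranks each topic's
# term weights in ascending order, so the tail slice takes the strongest terms.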
# Sidebar Navigation
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", ["Dataset", "Visualizations", "Model Info", "Model Metrics", "Predictor"])
# Model Information Page | |
if page == "Model Info": | |
st.title("Model Information") | |
st.write("### Machine Learning Models Used") | |
st.markdown( | |
""" | |
- **K-Means Clustering**: Used to group fake news articles into clusters based on their content similarity. | |
- **Latent Dirichlet Allocation (LDA)**: Used for topic modeling to extract the main topics from fake news articles. | |
- **TF-IDF Vectorizer**: Transforms the textual content into numerical features to be used by the models. | |
""" | |
) | |
# Dataset Page
elif page == "Dataset":
    st.title("Fake News Topic Analyzer")
    st.write("### About the Dataset")
    st.markdown(
        """
        The dataset contains **fake news articles** collected from multiple sources.
        It includes titles, article texts, and publishing dates.
        We use this dataset for **unsupervised clustering and topic modeling**.
        """
    )
st.write(f"π **Dataset Source:** [Kaggle: Fake News](<{DATASET_URL}>)") | |
st.write("### Sample Data (Raw)") | |
st.dataframe(fake_df[['title', 'text']].head()) | |
st.write("### Sample Data (Cleaned)") | |
st.dataframe(fake_df[['clean_text']].head()) | |
st.write("### Word Cloud of Most Frequent Words") | |
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(" ".join(fake_df['clean_text'])) | |
fig, ax = plt.subplots() | |
ax.imshow(wordcloud, interpolation="bilinear") | |
ax.axis("off") | |
st.pyplot(fig) | |
# Visualizations Page
elif page == "Visualizations":
    st.title("Fake News Clustering & Topic Modeling")
    st.write("### Cluster Distribution")
    fig, ax = plt.subplots()
    # Pass hue=x with legend=False so palette keeps working on seaborn >= 0.13
    sns.countplot(x=fake_df['cluster'], hue=fake_df['cluster'], palette="viridis", legend=False, ax=ax)
ax.set_xlabel("Cluster") | |
ax.set_ylabel("Number of Articles") | |
st.pyplot(fig) | |
st.write("### Topic Words from LDA") | |
    # Renamed loop variable so it no longer shadows the global `words` array
    for idx, topic_terms in enumerate(top_words):
        st.write(f"**Topic {idx}:** {topic_terms}")
# Model Metrics Page
elif page == "Model Metrics":
    st.title("Model Clustering Performance")
    sil_score = silhouette_score(X, fake_df['cluster'])
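    # Silhouette scores range from -1 to 1; higher means tighter, better-separated
    # clusters. Scoring the full TF-IDF matrix can be slow on large corpora.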
st.write(f"### Silhouette Score (K-Means Clustering): **{sil_score:.4f}**") | |
st.write("### Sample Articles per Cluster") | |
for cluster_id in sorted(fake_df['cluster'].unique()): | |
st.write(f"#### Cluster {cluster_id} Samples") | |
st.dataframe(fake_df[fake_df['cluster'] == cluster_id][['title', 'text']].head(3)) | |
# Predictor Page
elif page == "Predictor":
    st.title("Fake News Topic Analyzer")
    user_input = st.text_area("Enter news content:")
    if st.button("Analyze"):
        if user_input.strip():
            cleaned_input = clean_text(user_input)
            vectorized_input = vectorizer.transform([cleaned_input])
            cluster_pred = kmeans.predict(vectorized_input)[0]
            topic_pred = np.argmax(lda.transform(vectorized_input))
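            # lda.transform returns one topic-probability row per document;
            # argmax picks the single most probable topic for this input.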
st.write(f"### Predicted Cluster: {cluster_pred}") | |
# Handle out-of-range topic index | |
if topic_pred < len(top_words): | |
st.write(f"### Predicted Topic: {topic_pred} - {top_words[topic_pred]}") | |
else: | |
st.write(f"### Predicted Topic: {topic_pred} (No keywords available)") |