Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- .gitattributes +1 -0
- Fake.csv +3 -0
- app.py +139 -0
- kmeans_fake_news.pkl +3 -0
- lda_fake_news.pkl +3 -0
- tfidf_vectorizer.pkl +3 -0
- train.ipynb +321 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
Fake.csv filter=lfs diff=lfs merge=lfs -text
|
Fake.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bebf8bcfe95678bf2c732bf413a2ce5f621af0102c82bf08083b2e5d3c693d0c
|
3 |
+
size 62789876
|
app.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import re
|
5 |
+
import string
|
6 |
+
import joblib
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import seaborn as sns
|
9 |
+
from wordcloud import WordCloud
|
10 |
+
import nltk
|
11 |
+
from nltk.corpus import stopwords
|
12 |
+
from sklearn.metrics import silhouette_score
|
13 |
+
|
14 |
+
# Load the English stopword list. The NLTK corpus is downloaded only when it is
# missing locally: the lazy corpus loader raises LookupError in that case, so
# the script does not contact the NLTK index on every Streamlit rerun.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords", quiet=True)
    stop_words = set(stopwords.words("english"))

# Load the K-Means model, LDA model and TF-IDF vectorizer saved by train.ipynb
kmeans = joblib.load("kmeans_fake_news.pkl")
lda = joblib.load("lda_fake_news.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Load dataset
DATASET_URL = "https://www.kaggle.com/datasets/mrisdal/fake-news"
fake_df = pd.read_csv("Fake.csv")
|
28 |
+
|
29 |
+
def clean_text(text):
    """Normalize raw article text before it is passed to the TF-IDF vectorizer.

    The text is lowercased, every character matched by the punctuation
    character class is deleted, runs of digits are deleted, and
    whitespace-separated tokens found in the module-level ``stop_words`` set
    are dropped. The remaining tokens are joined with single spaces.

    NOTE: train.ipynb applies the same steps before fitting the vectorizer, so
    any change here should be mirrored there and the models re-fitted.
    """
    lowered = text.lower()

    # Character class built directly from string.punctuation
    punctuation_class = f"[{string.punctuation}]"
    without_punct = re.sub(punctuation_class, "", lowered)

    without_digits = re.sub(r"\d+", "", without_punct)

    kept_tokens = (
        token for token in without_digits.split() if token not in stop_words
    )
    return " ".join(kept_tokens)
|
36 |
+
|
37 |
+
# Keep only the title and body columns and discard rows missing either one
fake_df = fake_df[['title', 'text']].dropna()

# One document per article: title followed by body, then normalized
combined = fake_df['title'] + " " + fake_df['text']
fake_df['content'] = combined
fake_df['clean_text'] = combined.map(clean_text)

# Transform text into TF-IDF features and assign each article to a cluster
X = vectorizer.transform(fake_df['clean_text'])
cluster_labels = kmeans.predict(X)
fake_df['cluster'] = cluster_labels

# Ten highest-weighted vocabulary terms per LDA topic, in ascending weight order
words = np.array(vectorizer.get_feature_names_out())
top_words = []
for topic in lda.components_:
    strongest = np.argsort(topic)[-10:]
    top_words.append(" ".join(words[strongest]))

# Sidebar Navigation
st.sidebar.title("Navigation")
page = st.sidebar.radio(
    "Go to",
    ["Dataset", "Visualizations", "Model Info", "Model Metrics", "Predictor"],
)
|
52 |
+
|
53 |
+
# Render the page selected in the sidebar. Exactly one branch runs per rerun.

# Model Information Page
if page == "Model Info":
    st.title("Model Information")

    st.write("### Machine Learning Models Used")
    st.markdown(
        """
        - **K-Means Clustering**: Used to group fake news articles into clusters based on their content similarity.
        - **Latent Dirichlet Allocation (LDA)**: Used for topic modeling to extract the main topics from fake news articles.
        - **TF-IDF Vectorizer**: Transforms the textual content into numerical features to be used by the models.
        """
    )

# Dataset Page
elif page == "Dataset":
    st.title("Fake News Topic Analyzer")

    st.write("### About the Dataset")
    st.markdown(
        """
        The dataset contains **fake news articles** collected from multiple sources.
        It includes titles, article texts, and publishing dates.
        We use this dataset for **unsupervised clustering and topic modeling**.
        """
    )
    st.write(f"📂 **Dataset Source:** [Kaggle: Fake News](<{DATASET_URL}>)")

    st.write("### Sample Data (Raw)")
    st.dataframe(fake_df[['title', 'text']].head())

    st.write("### Sample Data (Cleaned)")
    st.dataframe(fake_df[['clean_text']].head())

    st.write("### Word Cloud of Most Frequent Words")
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(" ".join(fake_df['clean_text']))
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    st.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is closed;
    # close it so figures do not pile up across reruns.
    plt.close(fig)

# Visualizations Page
elif page == "Visualizations":
    st.title("Fake News Clustering & Topic Modeling")

    st.write("### Cluster Distribution")
    fig, ax = plt.subplots()
    sns.countplot(x=fake_df['cluster'], ax=ax, palette="viridis")
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Number of Articles")
    st.pyplot(fig)
    plt.close(fig)  # same reason as on the Dataset page

    st.write("### Topic Words from LDA")
    # Loop variable is not called `words` so the module-level vocabulary
    # array of that name is left intact.
    for idx, topic_words in enumerate(top_words):
        st.write(f"**Topic {idx}:** {topic_words}")

# Model Metrics Page
elif page == "Model Metrics":
    st.title("Model Clustering Performance")

    # NOTE: computed over the full TF-IDF matrix each time this page renders
    sil_score = silhouette_score(X, fake_df['cluster'])
    st.write(f"### Silhouette Score (K-Means Clustering): **{sil_score:.4f}**")

    st.write("### Sample Articles per Cluster")
    for cluster_id in sorted(fake_df['cluster'].unique()):
        st.write(f"#### Cluster {cluster_id} Samples")
        st.dataframe(fake_df[fake_df['cluster'] == cluster_id][['title', 'text']].head(3))

# Predictor Page
elif page == "Predictor":
    st.title("Fake News Topic Analyzer")

    user_input = st.text_area("Enter news content:")

    if st.button("Analyze"):
        if user_input.strip():
            # Same cleaning and vectorization path used for the dataset above
            cleaned_input = clean_text(user_input)
            vectorized_input = vectorizer.transform([cleaned_input])
            cluster_pred = kmeans.predict(vectorized_input)[0]
            # Index of the largest value in the LDA output for this one document
            topic_pred = np.argmax(lda.transform(vectorized_input))

            st.write(f"### Predicted Cluster: {cluster_pred}")

            # Handle out-of-range topic index
            if topic_pred < len(top_words):
                st.write(f"### Predicted Topic: {topic_pred} - {top_words[topic_pred]}")
            else:
                st.write(f"### Predicted Topic: {topic_pred} (No keywords available)")
        else:
            # Blank or whitespace-only input: tell the user instead of doing nothing
            st.warning("Please enter some news content to analyze.")
|
kmeans_fake_news.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a02ff155e5f58800f8bba6f72c45b8e92f6fa35f4ee314f5fe813a705e34c2b4
|
3 |
+
size 214667
|
lda_fake_news.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ab2298fb0f693a053fd395d2baba1d4ad30c191a2a0293338a1471860177551
|
3 |
+
size 406309
|
tfidf_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b15c1cf1ed716832967a7484647e290a03a20a7e99e1a91686d237437bdd2e75
|
3 |
+
size 184732
|
train.ipynb
ADDED
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 20,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import pandas as pd\n",
|
10 |
+
"import numpy as np\n",
|
11 |
+
"import re\n",
|
12 |
+
"import string\n",
|
13 |
+
"import nltk\n",
|
14 |
+
"from nltk.corpus import stopwords\n",
|
15 |
+
"from sklearn.decomposition import LatentDirichletAllocation\n",
|
16 |
+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
17 |
+
"from sklearn.cluster import KMeans\n",
|
18 |
+
"import matplotlib.pyplot as plt\n",
|
19 |
+
"import seaborn as sns\n",
|
20 |
+
"import joblib\n"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"cell_type": "code",
|
25 |
+
"execution_count": 2,
|
26 |
+
"metadata": {},
|
27 |
+
"outputs": [
|
28 |
+
{
|
29 |
+
"name": "stderr",
|
30 |
+
"output_type": "stream",
|
31 |
+
"text": [
|
32 |
+
"[nltk_data] Downloading package stopwords to C:\\Users\\Regino Balogo\n",
|
33 |
+
"[nltk_data] Jr\\AppData\\Roaming\\nltk_data...\n",
|
34 |
+
"[nltk_data] Unzipping corpora\\stopwords.zip.\n"
|
35 |
+
]
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"source": [
|
39 |
+
"# Download NLTK stopwords\n",
|
40 |
+
"nltk.download('stopwords')\n",
|
41 |
+
"stop_words = set(stopwords.words('english'))"
|
42 |
+
]
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"cell_type": "code",
|
46 |
+
"execution_count": 4,
|
47 |
+
"metadata": {},
|
48 |
+
"outputs": [],
|
49 |
+
"source": [
|
50 |
+
"# Load dataset\n",
|
51 |
+
"fake_df = pd.read_csv(\"Fake.csv\")"
|
52 |
+
]
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"cell_type": "code",
|
56 |
+
"execution_count": 6,
|
57 |
+
"metadata": {},
|
58 |
+
"outputs": [
|
59 |
+
{
|
60 |
+
"name": "stdout",
|
61 |
+
"output_type": "stream",
|
62 |
+
"text": [
|
63 |
+
"Initial Data:\n",
|
64 |
+
" title \\\n",
|
65 |
+
"0 Donald Trump Sends Out Embarrassing New Year’... \n",
|
66 |
+
"1 Drunk Bragging Trump Staffer Started Russian ... \n",
|
67 |
+
"2 Sheriff David Clarke Becomes An Internet Joke... \n",
|
68 |
+
"3 Trump Is So Obsessed He Even Has Obama’s Name... \n",
|
69 |
+
"4 Pope Francis Just Called Out Donald Trump Dur... \n",
|
70 |
+
"\n",
|
71 |
+
" text \n",
|
72 |
+
"0 Donald Trump just couldn t wish all Americans ... \n",
|
73 |
+
"1 House Intelligence Committee Chairman Devin Nu... \n",
|
74 |
+
"2 On Friday, it was revealed that former Milwauk... \n",
|
75 |
+
"3 On Christmas day, Donald Trump announced that ... \n",
|
76 |
+
"4 Pope Francis used his annual Christmas Day mes... \n"
|
77 |
+
]
|
78 |
+
}
|
79 |
+
],
|
80 |
+
"source": [
|
81 |
+
"print(\"Initial Data:\")\n",
|
82 |
+
"print(fake_df.head())"
|
83 |
+
]
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"cell_type": "code",
|
87 |
+
"execution_count": 7,
|
88 |
+
"metadata": {},
|
89 |
+
"outputs": [
|
90 |
+
{
|
91 |
+
"name": "stdout",
|
92 |
+
"output_type": "stream",
|
93 |
+
"text": [
|
94 |
+
"Data after dropping missing values:\n",
|
95 |
+
" title \\\n",
|
96 |
+
"0 Donald Trump Sends Out Embarrassing New Year’... \n",
|
97 |
+
"1 Drunk Bragging Trump Staffer Started Russian ... \n",
|
98 |
+
"2 Sheriff David Clarke Becomes An Internet Joke... \n",
|
99 |
+
"3 Trump Is So Obsessed He Even Has Obama’s Name... \n",
|
100 |
+
"4 Pope Francis Just Called Out Donald Trump Dur... \n",
|
101 |
+
"\n",
|
102 |
+
" text \n",
|
103 |
+
"0 Donald Trump just couldn t wish all Americans ... \n",
|
104 |
+
"1 House Intelligence Committee Chairman Devin Nu... \n",
|
105 |
+
"2 On Friday, it was revealed that former Milwauk... \n",
|
106 |
+
"3 On Christmas day, Donald Trump announced that ... \n",
|
107 |
+
"4 Pope Francis used his annual Christmas Day mes... \n"
|
108 |
+
]
|
109 |
+
}
|
110 |
+
],
|
111 |
+
"source": [
|
112 |
+
"# Keep only relevant columns\n",
|
113 |
+
"fake_df = fake_df[['title', 'text']].dropna()\n",
|
114 |
+
"print(\"Data after dropping missing values:\")\n",
|
115 |
+
"print(fake_df.head())\n"
|
116 |
+
]
|
117 |
+
},
|
118 |
+
{
|
119 |
+
"cell_type": "code",
|
120 |
+
"execution_count": 8,
|
121 |
+
"metadata": {},
|
122 |
+
"outputs": [],
|
123 |
+
"source": [
|
124 |
+
"# Combine title and text\n",
|
125 |
+
"fake_df['content'] = fake_df['title'] + \" \" + fake_df['text']"
|
126 |
+
]
|
127 |
+
},
|
128 |
+
{
|
129 |
+
"cell_type": "code",
|
130 |
+
"execution_count": 9,
|
131 |
+
"metadata": {},
|
132 |
+
"outputs": [],
|
133 |
+
"source": [
|
134 |
+
"# Function to clean text\n",
|
135 |
+
"def clean_text(text):\n",
|
136 |
+
" text = text.lower()\n",
|
137 |
+
" text = re.sub(f\"[{string.punctuation}]\", \"\", text)\n",
|
138 |
+
" text = re.sub(r\"\\d+\", \"\", text)\n",
|
139 |
+
" text = \" \".join([word for word in text.split() if word not in stop_words])\n",
|
140 |
+
" return text"
|
141 |
+
]
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"cell_type": "code",
|
145 |
+
"execution_count": 10,
|
146 |
+
"metadata": {},
|
147 |
+
"outputs": [
|
148 |
+
{
|
149 |
+
"name": "stdout",
|
150 |
+
"output_type": "stream",
|
151 |
+
"text": [
|
152 |
+
"Data after text cleaning:\n",
|
153 |
+
" content \\\n",
|
154 |
+
"0 Donald Trump Sends Out Embarrassing New Year’... \n",
|
155 |
+
"1 Drunk Bragging Trump Staffer Started Russian ... \n",
|
156 |
+
"2 Sheriff David Clarke Becomes An Internet Joke... \n",
|
157 |
+
"3 Trump Is So Obsessed He Even Has Obama’s Name... \n",
|
158 |
+
"4 Pope Francis Just Called Out Donald Trump Dur... \n",
|
159 |
+
"\n",
|
160 |
+
" clean_text \n",
|
161 |
+
"0 donald trump sends embarrassing new year’s eve... \n",
|
162 |
+
"1 drunk bragging trump staffer started russian c... \n",
|
163 |
+
"2 sheriff david clarke becomes internet joke thr... \n",
|
164 |
+
"3 trump obsessed even obama’s name coded website... \n",
|
165 |
+
"4 pope francis called donald trump christmas spe... \n"
|
166 |
+
]
|
167 |
+
}
|
168 |
+
],
|
169 |
+
"source": [
|
170 |
+
"# Apply text cleaning\n",
|
171 |
+
"fake_df['clean_text'] = fake_df['content'].apply(clean_text)\n",
|
172 |
+
"print(\"Data after text cleaning:\")\n",
|
173 |
+
"print(fake_df[['content', 'clean_text']].head())"
|
174 |
+
]
|
175 |
+
},
|
176 |
+
{
|
177 |
+
"cell_type": "code",
|
178 |
+
"execution_count": 11,
|
179 |
+
"metadata": {},
|
180 |
+
"outputs": [],
|
181 |
+
"source": [
|
182 |
+
"# Convert text to TF-IDF vectors\n",
|
183 |
+
"vectorizer = TfidfVectorizer(max_features=5000)\n",
|
184 |
+
"X = vectorizer.fit_transform(fake_df['clean_text'])"
|
185 |
+
]
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"cell_type": "code",
|
189 |
+
"execution_count": 12,
|
190 |
+
"metadata": {},
|
191 |
+
"outputs": [
|
192 |
+
{
|
193 |
+
"name": "stdout",
|
194 |
+
"output_type": "stream",
|
195 |
+
"text": [
|
196 |
+
"Cluster assignments:\n",
|
197 |
+
" title cluster\n",
|
198 |
+
"0 Donald Trump Sends Out Embarrassing New Year’... 2\n",
|
199 |
+
"1 Drunk Bragging Trump Staffer Started Russian ... 2\n",
|
200 |
+
"2 Sheriff David Clarke Becomes An Internet Joke... 1\n",
|
201 |
+
"3 Trump Is So Obsessed He Even Has Obama’s Name... 2\n",
|
202 |
+
"4 Pope Francis Just Called Out Donald Trump Dur... 1\n"
|
203 |
+
]
|
204 |
+
}
|
205 |
+
],
|
206 |
+
"source": [
|
207 |
+
"# Apply K-Means clustering\n",
|
208 |
+
"num_clusters = 3 # Try clustering articles into 3 groups\n",
|
209 |
+
"kmeans = KMeans(n_clusters=num_clusters, random_state=42)\n",
|
210 |
+
"fake_df['cluster'] = kmeans.fit_predict(X)\n",
|
211 |
+
"print(\"Cluster assignments:\")\n",
|
212 |
+
"print(fake_df[['title', 'cluster']].head())"
|
213 |
+
]
|
214 |
+
},
|
215 |
+
{
|
216 |
+
"cell_type": "code",
|
217 |
+
"execution_count": 13,
|
218 |
+
"metadata": {},
|
219 |
+
"outputs": [
|
220 |
+
{
|
221 |
+
"data": {
|
222 |
+
"image/png": "",
|
223 |
+
"text/plain": [
|
224 |
+
"<Figure size 640x480 with 1 Axes>"
|
225 |
+
]
|
226 |
+
},
|
227 |
+
"metadata": {},
|
228 |
+
"output_type": "display_data"
|
229 |
+
}
|
230 |
+
],
|
231 |
+
"source": [
|
232 |
+
"# Visualizing the clusters\n",
|
233 |
+
"sns.countplot(x=fake_df['cluster'])\n",
|
234 |
+
"plt.title(\"Fake News Clustering\")\n",
|
235 |
+
"plt.show()"
|
236 |
+
]
|
237 |
+
},
|
238 |
+
{
|
239 |
+
"cell_type": "code",
|
240 |
+
"execution_count": 17,
|
241 |
+
"metadata": {},
|
242 |
+
"outputs": [],
|
243 |
+
"source": [
|
244 |
+
"# Apply LDA for topic modeling\n",
|
245 |
+
"num_topics = 5\n",
|
246 |
+
"lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)\n",
|
247 |
+
"topic_matrix = lda.fit_transform(X)\n"
|
248 |
+
]
|
249 |
+
},
|
250 |
+
{
|
251 |
+
"cell_type": "code",
|
252 |
+
"execution_count": 18,
|
253 |
+
"metadata": {},
|
254 |
+
"outputs": [
|
255 |
+
{
|
256 |
+
"name": "stdout",
|
257 |
+
"output_type": "stream",
|
258 |
+
"text": [
|
259 |
+
"Topic 0: republican said vote state president people republicans would obama trump\n",
|
260 |
+
"Topic 1: students us school gun people video muslim said black police\n",
|
261 |
+
"Topic 2: one said like people clinton president video donald hillary trump\n",
|
262 |
+
"Topic 3: judge maxine jeanine nancy bundy flint waters video moore pelosi\n",
|
263 |
+
"Topic 4: investigation intelligence comey us hillary russian fbi russia clinton trump\n"
|
264 |
+
]
|
265 |
+
}
|
266 |
+
],
|
267 |
+
"source": [
|
268 |
+
"# Show top words for each topic\n",
|
269 |
+
"words = np.array(vectorizer.get_feature_names_out())\n",
|
270 |
+
"top_words = []\n",
|
271 |
+
"for topic_idx, topic in enumerate(lda.components_):\n",
|
272 |
+
" top_words.append(\" \".join(words[np.argsort(topic)][-10:]))\n",
|
273 |
+
" print(f\"Topic {topic_idx}: {top_words[-1]}\")"
|
274 |
+
]
|
275 |
+
},
|
276 |
+
{
|
277 |
+
"cell_type": "code",
|
278 |
+
"execution_count": 21,
|
279 |
+
"metadata": {},
|
280 |
+
"outputs": [
|
281 |
+
{
|
282 |
+
"data": {
|
283 |
+
"text/plain": [
|
284 |
+
"['tfidf_vectorizer.pkl']"
|
285 |
+
]
|
286 |
+
},
|
287 |
+
"execution_count": 21,
|
288 |
+
"metadata": {},
|
289 |
+
"output_type": "execute_result"
|
290 |
+
}
|
291 |
+
],
|
292 |
+
"source": [
|
293 |
+
"# Save model and vectorizer\n",
|
294 |
+
"joblib.dump(kmeans, \"kmeans_fake_news.pkl\")\n",
|
295 |
+
"joblib.dump(lda, \"lda_fake_news.pkl\")\n",
|
296 |
+
"joblib.dump(vectorizer, \"tfidf_vectorizer.pkl\")\n"
|
297 |
+
]
|
298 |
+
}
|
299 |
+
],
|
300 |
+
"metadata": {
|
301 |
+
"kernelspec": {
|
302 |
+
"display_name": "Python 3",
|
303 |
+
"language": "python",
|
304 |
+
"name": "python3"
|
305 |
+
},
|
306 |
+
"language_info": {
|
307 |
+
"codemirror_mode": {
|
308 |
+
"name": "ipython",
|
309 |
+
"version": 3
|
310 |
+
},
|
311 |
+
"file_extension": ".py",
|
312 |
+
"mimetype": "text/x-python",
|
313 |
+
"name": "python",
|
314 |
+
"nbconvert_exporter": "python",
|
315 |
+
"pygments_lexer": "ipython3",
|
316 |
+
"version": "3.11.9"
|
317 |
+
}
|
318 |
+
},
|
319 |
+
"nbformat": 4,
|
320 |
+
"nbformat_minor": 2
|
321 |
+
}
|