Rejeno committed on
Commit
d2ab310
·
verified ·
1 Parent(s): 0732a0a

Upload 6 files

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. Fake.csv +3 -0
  3. app.py +139 -0
  4. kmeans_fake_news.pkl +3 -0
  5. lda_fake_news.pkl +3 -0
  6. tfidf_vectorizer.pkl +3 -0
  7. train.ipynb +321 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Fake.csv filter=lfs diff=lfs merge=lfs -text
Fake.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bebf8bcfe95678bf2c732bf413a2ce5f621af0102c82bf08083b2e5d3c693d0c
3
+ size 62789876
app.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ import string
6
+ import joblib
7
+ import matplotlib.pyplot as plt
8
+ import seaborn as sns
9
+ from wordcloud import WordCloud
10
+ import nltk
11
+ from nltk.corpus import stopwords
12
+ from sklearn.metrics import silhouette_score
13
+
14
# Download stopwords if not available.
# quiet=True keeps the downloader's progress log out of the output, which
# would otherwise be printed again on every rerun of this script.
nltk.download("stopwords", quiet=True)


@st.cache_resource
def _load_artifacts():
    """Load the fitted K-Means, LDA and TF-IDF objects once per server process.

    Streamlit re-executes this script from the top on every widget
    interaction; caching avoids re-reading the three pickles each time.

    NOTE(review): joblib.load unpickles arbitrary Python objects. These files
    ship with the app; never point this at files from an untrusted source.

    Returns:
        A ``(kmeans, lda, vectorizer)`` tuple.
    """
    return (
        joblib.load("kmeans_fake_news.pkl"),
        joblib.load("lda_fake_news.pkl"),
        joblib.load("tfidf_vectorizer.pkl"),
    )


@st.cache_data
def _load_dataset(path):
    """Read the raw CSV at *path* once; later reruns receive a cached copy."""
    return pd.read_csv(path)


# Load models and vectorizer
kmeans, lda, vectorizer = _load_artifacts()

# Load dataset
DATASET_URL = "https://www.kaggle.com/datasets/mrisdal/fake-news"
fake_df = _load_dataset("Fake.csv")

# Preprocessing
stop_words = set(stopwords.words("english"))
28
+
29
# Character class matching every ASCII punctuation mark. re.escape is needed
# because string.punctuation contains "\" and "]": left unescaped, the "\]"
# pair is parsed as an escaped bracket and a literal backslash never matches.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_DIGITS_RE = re.compile(r"\d+")


def clean_text(text):
    """Return *text* lowercased with punctuation, digits and stopwords removed.

    Only ASCII punctuation (``string.punctuation``) is stripped; typographic
    characters such as the curly apostrophe are left in place. Stopwords are
    taken from the module-level ``stop_words`` set, and the surviving words
    are re-joined with single spaces.

    NOTE(review): train.ipynb carries its own copy of this function that uses
    the unescaped pattern, so text containing a backslash is cleaned slightly
    differently there. Keep both copies in sync when retraining.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    text = text.lower()
    text = _PUNCTUATION_RE.sub("", text)  # Remove punctuation
    text = _DIGITS_RE.sub("", text)  # Remove numbers
    # split() with no argument also collapses the gaps left by removed tokens.
    return " ".join(word for word in text.split() if word not in stop_words)
36
+
37
@st.cache_resource
def _prepare_corpus(_raw_df, _vectorizer, _kmeans):
    """Clean the corpus, vectorize it and assign cluster labels, once per process.

    Streamlit re-executes this script on every widget interaction; without
    caching, every article would be re-cleaned and re-vectorized each time.
    The leading underscores tell Streamlit not to hash the arguments, so the
    first result is reused for the lifetime of the server process.

    The returned objects are shared between reruns and sessions: treat them
    as read-only.

    Args:
        _raw_df: DataFrame holding at least 'title' and 'text' columns.
        _vectorizer: Fitted vectorizer exposing ``transform``.
        _kmeans: Fitted clustering model exposing ``predict``.

    Returns:
        A ``(corpus, features)`` tuple: the DataFrame with 'content',
        'clean_text' and 'cluster' columns added, and the matrix returned by
        the vectorizer for the cleaned text.
    """
    corpus = _raw_df[['title', 'text']].dropna()
    corpus['content'] = corpus['title'] + " " + corpus['text']
    corpus['clean_text'] = corpus['content'].apply(clean_text)

    # Transform text into TF-IDF features
    features = _vectorizer.transform(corpus['clean_text'])
    corpus['cluster'] = _kmeans.predict(features)
    return corpus, features


fake_df, X = _prepare_corpus(fake_df, vectorizer, kmeans)

# Get top words for LDA topics.
# argsort is ascending, so the last ten indices are the ten highest-weighted
# terms of each topic, with the strongest term listed last.
words = np.array(vectorizer.get_feature_names_out())
top_words = [" ".join(words[np.argsort(topic)][-10:]) for topic in lda.components_]
48
+
49
# Sidebar Navigation
# The radio selection decides which single page is rendered on this run.
st.sidebar.title("Navigation")
page = st.sidebar.radio(
    "Go to",
    ["Dataset", "Visualizations", "Model Info", "Model Metrics", "Predictor"],
)

# Model Information Page
if page == "Model Info":
    st.title("Model Information")

    st.write("### Machine Learning Models Used")
    st.markdown(
        """
        - **K-Means Clustering**: Used to group fake news articles into clusters based on their content similarity.
        - **Latent Dirichlet Allocation (LDA)**: Used for topic modeling to extract the main topics from fake news articles.
        - **TF-IDF Vectorizer**: Transforms the textual content into numerical features to be used by the models.
        """
    )

# Dataset Page
elif page == "Dataset":
    st.title("Fake News Topic Analyzer")

    st.write("### About the Dataset")
    st.markdown(
        """
        The dataset contains **fake news articles** collected from multiple sources.
        It includes titles, article texts, and publishing dates.
        We use this dataset for **unsupervised clustering and topic modeling**.
        """
    )
    st.write(f"📂 **Dataset Source:** [Kaggle: Fake News](<{DATASET_URL}>)")

    st.write("### Sample Data (Raw)")
    st.dataframe(fake_df[['title', 'text']].head())

    st.write("### Sample Data (Cleaned)")
    st.dataframe(fake_df[['clean_text']].head())

    st.write("### Word Cloud of Most Frequent Words")
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(
        " ".join(fake_df['clean_text'])
    )
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    st.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is closed;
    # without this, figures pile up across script reruns.
    plt.close(fig)

# Visualizations Page
elif page == "Visualizations":
    st.title("Fake News Clustering & Topic Modeling")

    st.write("### Cluster Distribution")
    fig, ax = plt.subplots()
    sns.countplot(x=fake_df['cluster'], ax=ax, palette="viridis")
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Number of Articles")
    st.pyplot(fig)
    plt.close(fig)  # release the figure, see the Dataset page

    st.write("### Topic Words from LDA")
    # Distinct loop names so the module-level `words` vocabulary array is not
    # overwritten by a topic string.
    for topic_idx, topic_words in enumerate(top_words):
        st.write(f"**Topic {topic_idx}:** {topic_words}")

# Model Metrics Page
elif page == "Model Metrics":
    st.title("Model Clustering Performance")

    # NOTE(review): the silhouette score is computed over every row of X each
    # time this page is rendered. If the page is slow, consider the
    # `sample_size` argument of silhouette_score or caching the value.
    sil_score = silhouette_score(X, fake_df['cluster'])
    st.write(f"### Silhouette Score (K-Means Clustering): **{sil_score:.4f}**")

    st.write("### Sample Articles per Cluster")
    for cluster_id in sorted(fake_df['cluster'].unique()):
        st.write(f"#### Cluster {cluster_id} Samples")
        st.dataframe(fake_df[fake_df['cluster'] == cluster_id][['title', 'text']].head(3))

# Predictor Page
elif page == "Predictor":
    st.title("Fake News Topic Analyzer")

    user_input = st.text_area("Enter news content:")

    if st.button("Analyze"):
        if not user_input.strip():
            # Previously a blank submission produced no feedback at all.
            st.warning("Please enter some text to analyze.")
        else:
            cleaned_input = clean_text(user_input)
            vectorized_input = vectorizer.transform([cleaned_input])

            if vectorized_input.nnz == 0:
                # No word of the input is in the fitted vocabulary: the
                # feature vector is all zeros and any prediction made from it
                # would be meaningless.
                st.warning(
                    "None of the words in the input are known to the model, "
                    "so no prediction can be made."
                )
            else:
                cluster_pred = kmeans.predict(vectorized_input)[0]
                topic_pred = np.argmax(lda.transform(vectorized_input))

                st.write(f"### Predicted Cluster: {cluster_pred}")

                # Handle out-of-range topic index (defensive: top_words is
                # built elsewhere and is expected to have one entry per topic).
                if topic_pred < len(top_words):
                    st.write(f"### Predicted Topic: {topic_pred} - {top_words[topic_pred]}")
                else:
                    st.write(f"### Predicted Topic: {topic_pred} (No keywords available)")
kmeans_fake_news.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a02ff155e5f58800f8bba6f72c45b8e92f6fa35f4ee314f5fe813a705e34c2b4
3
+ size 214667
lda_fake_news.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ab2298fb0f693a053fd395d2baba1d4ad30c191a2a0293338a1471860177551
3
+ size 406309
tfidf_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b15c1cf1ed716832967a7484647e290a03a20a7e99e1a91686d237437bdd2e75
3
+ size 184732
train.ipynb ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 20,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import numpy as np\n",
11
+ "import re\n",
12
+ "import string\n",
13
+ "import nltk\n",
14
+ "from nltk.corpus import stopwords\n",
15
+ "from sklearn.decomposition import LatentDirichletAllocation\n",
16
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
17
+ "from sklearn.cluster import KMeans\n",
18
+ "import matplotlib.pyplot as plt\n",
19
+ "import seaborn as sns\n",
20
+ "import joblib\n"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 2,
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "name": "stderr",
30
+ "output_type": "stream",
31
+ "text": [
32
+ "[nltk_data] Downloading package stopwords to C:\\Users\\Regino Balogo\n",
33
+ "[nltk_data] Jr\\AppData\\Roaming\\nltk_data...\n",
34
+ "[nltk_data] Unzipping corpora\\stopwords.zip.\n"
35
+ ]
36
+ }
37
+ ],
38
+ "source": [
39
+ "# Download NLTK stopwords\n",
40
+ "nltk.download('stopwords')\n",
41
+ "stop_words = set(stopwords.words('english'))"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 4,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "# Load dataset\n",
51
+ "fake_df = pd.read_csv(\"Fake.csv\")"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 6,
57
+ "metadata": {},
58
+ "outputs": [
59
+ {
60
+ "name": "stdout",
61
+ "output_type": "stream",
62
+ "text": [
63
+ "Initial Data:\n",
64
+ " title \\\n",
65
+ "0 Donald Trump Sends Out Embarrassing New Year’... \n",
66
+ "1 Drunk Bragging Trump Staffer Started Russian ... \n",
67
+ "2 Sheriff David Clarke Becomes An Internet Joke... \n",
68
+ "3 Trump Is So Obsessed He Even Has Obama’s Name... \n",
69
+ "4 Pope Francis Just Called Out Donald Trump Dur... \n",
70
+ "\n",
71
+ " text \n",
72
+ "0 Donald Trump just couldn t wish all Americans ... \n",
73
+ "1 House Intelligence Committee Chairman Devin Nu... \n",
74
+ "2 On Friday, it was revealed that former Milwauk... \n",
75
+ "3 On Christmas day, Donald Trump announced that ... \n",
76
+ "4 Pope Francis used his annual Christmas Day mes... \n"
77
+ ]
78
+ }
79
+ ],
80
+ "source": [
81
+ "print(\"Initial Data:\")\n",
82
+ "print(fake_df.head())"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 7,
88
+ "metadata": {},
89
+ "outputs": [
90
+ {
91
+ "name": "stdout",
92
+ "output_type": "stream",
93
+ "text": [
94
+ "Data after dropping missing values:\n",
95
+ " title \\\n",
96
+ "0 Donald Trump Sends Out Embarrassing New Year’... \n",
97
+ "1 Drunk Bragging Trump Staffer Started Russian ... \n",
98
+ "2 Sheriff David Clarke Becomes An Internet Joke... \n",
99
+ "3 Trump Is So Obsessed He Even Has Obama’s Name... \n",
100
+ "4 Pope Francis Just Called Out Donald Trump Dur... \n",
101
+ "\n",
102
+ " text \n",
103
+ "0 Donald Trump just couldn t wish all Americans ... \n",
104
+ "1 House Intelligence Committee Chairman Devin Nu... \n",
105
+ "2 On Friday, it was revealed that former Milwauk... \n",
106
+ "3 On Christmas day, Donald Trump announced that ... \n",
107
+ "4 Pope Francis used his annual Christmas Day mes... \n"
108
+ ]
109
+ }
110
+ ],
111
+ "source": [
112
+ "# Keep only relevant columns\n",
113
+ "fake_df = fake_df[['title', 'text']].dropna()\n",
114
+ "print(\"Data after dropping missing values:\")\n",
115
+ "print(fake_df.head())\n"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 8,
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "# Combine title and text\n",
125
+ "fake_df['content'] = fake_df['title'] + \" \" + fake_df['text']"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 9,
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "# Function to clean text\n",
135
+ "def clean_text(text):\n",
136
+ " text = text.lower()\n",
137
+ " text = re.sub(f\"[{string.punctuation}]\", \"\", text)\n",
138
+ " text = re.sub(r\"\\d+\", \"\", text)\n",
139
+ " text = \" \".join([word for word in text.split() if word not in stop_words])\n",
140
+ " return text"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 10,
146
+ "metadata": {},
147
+ "outputs": [
148
+ {
149
+ "name": "stdout",
150
+ "output_type": "stream",
151
+ "text": [
152
+ "Data after text cleaning:\n",
153
+ " content \\\n",
154
+ "0 Donald Trump Sends Out Embarrassing New Year’... \n",
155
+ "1 Drunk Bragging Trump Staffer Started Russian ... \n",
156
+ "2 Sheriff David Clarke Becomes An Internet Joke... \n",
157
+ "3 Trump Is So Obsessed He Even Has Obama’s Name... \n",
158
+ "4 Pope Francis Just Called Out Donald Trump Dur... \n",
159
+ "\n",
160
+ " clean_text \n",
161
+ "0 donald trump sends embarrassing new year’s eve... \n",
162
+ "1 drunk bragging trump staffer started russian c... \n",
163
+ "2 sheriff david clarke becomes internet joke thr... \n",
164
+ "3 trump obsessed even obama’s name coded website... \n",
165
+ "4 pope francis called donald trump christmas spe... \n"
166
+ ]
167
+ }
168
+ ],
169
+ "source": [
170
+ "# Apply text cleaning\n",
171
+ "fake_df['clean_text'] = fake_df['content'].apply(clean_text)\n",
172
+ "print(\"Data after text cleaning:\")\n",
173
+ "print(fake_df[['content', 'clean_text']].head())"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 11,
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
182
+ "# Convert text to TF-IDF vectors\n",
183
+ "vectorizer = TfidfVectorizer(max_features=5000)\n",
184
+ "X = vectorizer.fit_transform(fake_df['clean_text'])"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": 12,
190
+ "metadata": {},
191
+ "outputs": [
192
+ {
193
+ "name": "stdout",
194
+ "output_type": "stream",
195
+ "text": [
196
+ "Cluster assignments:\n",
197
+ " title cluster\n",
198
+ "0 Donald Trump Sends Out Embarrassing New Year’... 2\n",
199
+ "1 Drunk Bragging Trump Staffer Started Russian ... 2\n",
200
+ "2 Sheriff David Clarke Becomes An Internet Joke... 1\n",
201
+ "3 Trump Is So Obsessed He Even Has Obama’s Name... 2\n",
202
+ "4 Pope Francis Just Called Out Donald Trump Dur... 1\n"
203
+ ]
204
+ }
205
+ ],
206
+ "source": [
207
+ "# Apply K-Means clustering\n",
208
+ "num_clusters = 3 # Try clustering articles into 3 groups\n",
209
+ "kmeans = KMeans(n_clusters=num_clusters, random_state=42)\n",
210
+ "fake_df['cluster'] = kmeans.fit_predict(X)\n",
211
+ "print(\"Cluster assignments:\")\n",
212
+ "print(fake_df[['title', 'cluster']].head())"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": 13,
218
+ "metadata": {},
219
+ "outputs": [
220
+ {
221
+ "data": {
222
+ "image/png": "",
223
+ "text/plain": [
224
+ "<Figure size 640x480 with 1 Axes>"
225
+ ]
226
+ },
227
+ "metadata": {},
228
+ "output_type": "display_data"
229
+ }
230
+ ],
231
+ "source": [
232
+ "# Visualizing the clusters\n",
233
+ "sns.countplot(x=fake_df['cluster'])\n",
234
+ "plt.title(\"Fake News Clustering\")\n",
235
+ "plt.show()"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": 17,
241
+ "metadata": {},
242
+ "outputs": [],
243
+ "source": [
244
+ "# Apply LDA for topic modeling\n",
245
+ "num_topics = 5\n",
246
+ "lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)\n",
247
+ "topic_matrix = lda.fit_transform(X)\n"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": 18,
253
+ "metadata": {},
254
+ "outputs": [
255
+ {
256
+ "name": "stdout",
257
+ "output_type": "stream",
258
+ "text": [
259
+ "Topic 0: republican said vote state president people republicans would obama trump\n",
260
+ "Topic 1: students us school gun people video muslim said black police\n",
261
+ "Topic 2: one said like people clinton president video donald hillary trump\n",
262
+ "Topic 3: judge maxine jeanine nancy bundy flint waters video moore pelosi\n",
263
+ "Topic 4: investigation intelligence comey us hillary russian fbi russia clinton trump\n"
264
+ ]
265
+ }
266
+ ],
267
+ "source": [
268
+ "# Show top words for each topic\n",
269
+ "words = np.array(vectorizer.get_feature_names_out())\n",
270
+ "top_words = []\n",
271
+ "for topic_idx, topic in enumerate(lda.components_):\n",
272
+ " top_words.append(\" \".join(words[np.argsort(topic)][-10:]))\n",
273
+ " print(f\"Topic {topic_idx}: {top_words[-1]}\")"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 21,
279
+ "metadata": {},
280
+ "outputs": [
281
+ {
282
+ "data": {
283
+ "text/plain": [
284
+ "['tfidf_vectorizer.pkl']"
285
+ ]
286
+ },
287
+ "execution_count": 21,
288
+ "metadata": {},
289
+ "output_type": "execute_result"
290
+ }
291
+ ],
292
+ "source": [
293
+ "# Save model and vectorizer\n",
294
+ "joblib.dump(kmeans, \"kmeans_fake_news.pkl\")\n",
295
+ "joblib.dump(lda, \"lda_fake_news.pkl\")\n",
296
+ "joblib.dump(vectorizer, \"tfidf_vectorizer.pkl\")\n"
297
+ ]
298
+ }
299
+ ],
300
+ "metadata": {
301
+ "kernelspec": {
302
+ "display_name": "Python 3",
303
+ "language": "python",
304
+ "name": "python3"
305
+ },
306
+ "language_info": {
307
+ "codemirror_mode": {
308
+ "name": "ipython",
309
+ "version": 3
310
+ },
311
+ "file_extension": ".py",
312
+ "mimetype": "text/x-python",
313
+ "name": "python",
314
+ "nbconvert_exporter": "python",
315
+ "pygments_lexer": "ipython3",
316
+ "version": "3.11.9"
317
+ }
318
+ },
319
+ "nbformat": 4,
320
+ "nbformat_minor": 2
321
+ }