Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- .gitattributes +1 -0
- Fake.csv +3 -0
- app.py +139 -0
- kmeans_fake_news.pkl +3 -0
- lda_fake_news.pkl +3 -0
- tfidf_vectorizer.pkl +3 -0
- train.ipynb +321 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
Fake.csv filter=lfs diff=lfs merge=lfs -text
|
Fake.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bebf8bcfe95678bf2c732bf413a2ce5f621af0102c82bf08083b2e5d3c693d0c
|
3 |
+
size 62789876
|
app.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import re
|
5 |
+
import string
|
6 |
+
import joblib
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import seaborn as sns
|
9 |
+
from wordcloud import WordCloud
|
10 |
+
import nltk
|
11 |
+
from nltk.corpus import stopwords
|
12 |
+
from sklearn.metrics import silhouette_score
|
13 |
+
|
14 |
+
# Download the NLTK stopword corpus only when it is not already installed.
# Streamlit re-executes this script on every widget interaction, so an
# unconditional nltk.download() would be repeated on each rerun.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords", quiet=True)


@st.cache_resource
def _load_artifacts():
    """Load the pickled K-Means model, LDA model and TF-IDF vectorizer.

    Cached with ``st.cache_resource`` so the pickles are read from disk once
    and the same objects are reused across reruns.

    Returns:
        A ``(kmeans, lda, vectorizer)`` tuple, in that order.
    """
    return (
        joblib.load("kmeans_fake_news.pkl"),
        joblib.load("lda_fake_news.pkl"),
        joblib.load("tfidf_vectorizer.pkl"),
    )


@st.cache_data
def _load_dataset(path):
    """Read the CSV at *path* into a DataFrame, cached per distinct path."""
    return pd.read_csv(path)


# Load models and vectorizer
kmeans, lda, vectorizer = _load_artifacts()

# Load dataset
DATASET_URL = "https://www.kaggle.com/datasets/mrisdal/fake-news"
fake_df = _load_dataset("Fake.csv")

# Preprocessing
# A set gives O(1) membership tests when filtering words in clean_text().
stop_words = set(stopwords.words("english"))
|
28 |
+
|
29 |
+
def clean_text(text):
    """Clean raw text by removing punctuation, numbers, and stopwords.

    The text is lower-cased, every character in ``string.punctuation`` and
    every run of digits is deleted, and the remaining whitespace-separated
    words are filtered against the module-level ``stop_words`` set.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned words joined by single spaces.
    """
    text = text.lower()
    # string.punctuation contains regex metacharacters (\ ] ^ -). Without
    # re.escape the backslash escapes the following "]" inside the character
    # class, so literal backslashes were never stripped.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    # NOTE(review): the vectorizer was presumably fitted on text cleaned the
    # same way; keep any training-side copy of this function in sync.
    return " ".join(word for word in text.split() if word not in stop_words)  # Remove stopwords
|
36 |
+
|
37 |
+
@st.cache_data
def _prepare_dataset(_raw_df):
    """Clean the articles, vectorize them and assign K-Means clusters.

    Streamlit re-executes the script on every interaction; caching keeps the
    corpus from being re-cleaned, re-vectorized and re-clustered each time.
    The leading underscore on the parameter tells Streamlit not to hash the
    DataFrame, so the result is computed once per server process.

    Args:
        _raw_df: DataFrame with at least ``title`` and ``text`` columns.

    Returns:
        A ``(df, features)`` tuple: ``df`` holds ``title``, ``text``,
        ``content``, ``clean_text`` and ``cluster`` columns for the rows with
        no missing title/text, and ``features`` is the output of
        ``vectorizer.transform`` for those rows.
    """
    df = _raw_df[['title', 'text']].dropna()
    df['content'] = df['title'] + " " + df['text']
    df['clean_text'] = df['content'].apply(clean_text)

    # Transform text into TF-IDF features
    features = vectorizer.transform(df['clean_text'])
    df['cluster'] = kmeans.predict(features)
    return df, features


fake_df, X = _prepare_dataset(fake_df)

# Get top words for LDA topics
words = np.array(vectorizer.get_feature_names_out())
# argsort is ascending, so the last 10 entries are the 10 highest-weighted
# words of each topic (listed from lowest to highest weight).
top_words = [" ".join(words[np.argsort(topic)][-10:]) for topic in lda.components_]
|
48 |
+
|
49 |
+
# Sidebar Navigation
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", ["Dataset", "Visualizations", "Model Info", "Model Metrics", "Predictor"])


@st.cache_data
def _cached_silhouette(_features, _labels):
    """Return the silhouette score of *_labels* over *_features*.

    The score is computed over the whole corpus, so it is cached instead of
    being recomputed on every visit to the metrics page. Both parameters are
    underscore-prefixed so Streamlit does not hash them.
    """
    return silhouette_score(_features, _labels)


# Model Information Page
if page == "Model Info":
    st.title("Model Information")

    st.write("### Machine Learning Models Used")
    st.markdown(
        """
- **K-Means Clustering**: Used to group fake news articles into clusters based on their content similarity.
- **Latent Dirichlet Allocation (LDA)**: Used for topic modeling to extract the main topics from fake news articles.
- **TF-IDF Vectorizer**: Transforms the textual content into numerical features to be used by the models.
"""
    )

# Dataset Page
elif page == "Dataset":
    st.title("Fake News Topic Analyzer")

    st.write("### About the Dataset")
    st.markdown(
        """
The dataset contains **fake news articles** collected from multiple sources.
It includes titles, article texts, and publishing dates.
We use this dataset for **unsupervised clustering and topic modeling**.
"""
    )
    st.write(f"📂 **Dataset Source:** [Kaggle: Fake News](<{DATASET_URL}>)")

    st.write("### Sample Data (Raw)")
    st.dataframe(fake_df[['title', 'text']].head())

    st.write("### Sample Data (Cleaned)")
    st.dataframe(fake_df[['clean_text']].head())

    st.write("### Word Cloud of Most Frequent Words")
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(" ".join(fake_df['clean_text']))
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    st.pyplot(fig)
    # pyplot keeps every figure registered until it is closed; release it so
    # figures do not accumulate across reruns of a long-lived server.
    plt.close(fig)

# Visualizations Page
elif page == "Visualizations":
    st.title("Fake News Clustering & Topic Modeling")

    st.write("### Cluster Distribution")
    fig, ax = plt.subplots()
    sns.countplot(x=fake_df['cluster'], ax=ax, palette="viridis")
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Number of Articles")
    st.pyplot(fig)
    plt.close(fig)  # see the note on the Dataset page: avoid leaking figures

    st.write("### Topic Words from LDA")
    # Loop variable deliberately not named `words`, which would overwrite the
    # module-level array of vectorizer feature names.
    for idx, topic_words in enumerate(top_words):
        st.write(f"**Topic {idx}:** {topic_words}")

# Model Metrics Page
elif page == "Model Metrics":
    st.title("Model Clustering Performance")

    sil_score = _cached_silhouette(X, fake_df['cluster'])
    st.write(f"### Silhouette Score (K-Means Clustering): **{sil_score:.4f}**")

    st.write("### Sample Articles per Cluster")
    for cluster_id in sorted(fake_df['cluster'].unique()):
        st.write(f"#### Cluster {cluster_id} Samples")
        st.dataframe(fake_df[fake_df['cluster'] == cluster_id][['title', 'text']].head(3))

# Predictor Page
elif page == "Predictor":
    st.title("Fake News Topic Analyzer")

    user_input = st.text_area("Enter news content:")

    if st.button("Analyze"):
        if user_input.strip():
            # Apply the same cleaning and vectorization used for the dataset.
            cleaned_input = clean_text(user_input)
            vectorized_input = vectorizer.transform([cleaned_input])
            cluster_pred = kmeans.predict(vectorized_input)[0]
            # Index of the highest-scoring topic for the single input row.
            topic_pred = np.argmax(lda.transform(vectorized_input))

            st.write(f"### Predicted Cluster: {cluster_pred}")

            # Handle out-of-range topic index
            if topic_pred < len(top_words):
                st.write(f"### Predicted Topic: {topic_pred} - {top_words[topic_pred]}")
            else:
                st.write(f"### Predicted Topic: {topic_pred} (No keywords available)")
        else:
            # Previously a blank submission produced no output at all.
            st.warning("Please enter some news content to analyze.")
|
kmeans_fake_news.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a02ff155e5f58800f8bba6f72c45b8e92f6fa35f4ee314f5fe813a705e34c2b4
|
3 |
+
size 214667
|
lda_fake_news.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ab2298fb0f693a053fd395d2baba1d4ad30c191a2a0293338a1471860177551
|
3 |
+
size 406309
|
tfidf_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b15c1cf1ed716832967a7484647e290a03a20a7e99e1a91686d237437bdd2e75
|
3 |
+
size 184732
|
train.ipynb
ADDED
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 20,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import pandas as pd\n",
|
10 |
+
"import numpy as np\n",
|
11 |
+
"import re\n",
|
12 |
+
"import string\n",
|
13 |
+
"import nltk\n",
|
14 |
+
"from nltk.corpus import stopwords\n",
|
15 |
+
"from sklearn.decomposition import LatentDirichletAllocation\n",
|
16 |
+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
17 |
+
"from sklearn.cluster import KMeans\n",
|
18 |
+
"import matplotlib.pyplot as plt\n",
|
19 |
+
"import seaborn as sns\n",
|
20 |
+
"import joblib\n"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"cell_type": "code",
|
25 |
+
"execution_count": 2,
|
26 |
+
"metadata": {},
|
27 |
+
"outputs": [
|
28 |
+
{
|
29 |
+
"name": "stderr",
|
30 |
+
"output_type": "stream",
|
31 |
+
"text": [
|
32 |
+
"[nltk_data] Downloading package stopwords to C:\\Users\\Regino Balogo\n",
|
33 |
+
"[nltk_data] Jr\\AppData\\Roaming\\nltk_data...\n",
|
34 |
+
"[nltk_data] Unzipping corpora\\stopwords.zip.\n"
|
35 |
+
]
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"source": [
|
39 |
+
"# Download NLTK stopwords\n",
|
40 |
+
"nltk.download('stopwords')\n",
|
41 |
+
"stop_words = set(stopwords.words('english'))"
|
42 |
+
]
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"cell_type": "code",
|
46 |
+
"execution_count": 4,
|
47 |
+
"metadata": {},
|
48 |
+
"outputs": [],
|
49 |
+
"source": [
|
50 |
+
"# Load dataset\n",
|
51 |
+
"fake_df = pd.read_csv(\"Fake.csv\")"
|
52 |
+
]
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"cell_type": "code",
|
56 |
+
"execution_count": 6,
|
57 |
+
"metadata": {},
|
58 |
+
"outputs": [
|
59 |
+
{
|
60 |
+
"name": "stdout",
|
61 |
+
"output_type": "stream",
|
62 |
+
"text": [
|
63 |
+
"Initial Data:\n",
|
64 |
+
" title \\\n",
|
65 |
+
"0 Donald Trump Sends Out Embarrassing New Year’... \n",
|
66 |
+
"1 Drunk Bragging Trump Staffer Started Russian ... \n",
|
67 |
+
"2 Sheriff David Clarke Becomes An Internet Joke... \n",
|
68 |
+
"3 Trump Is So Obsessed He Even Has Obama’s Name... \n",
|
69 |
+
"4 Pope Francis Just Called Out Donald Trump Dur... \n",
|
70 |
+
"\n",
|
71 |
+
" text \n",
|
72 |
+
"0 Donald Trump just couldn t wish all Americans ... \n",
|
73 |
+
"1 House Intelligence Committee Chairman Devin Nu... \n",
|
74 |
+
"2 On Friday, it was revealed that former Milwauk... \n",
|
75 |
+
"3 On Christmas day, Donald Trump announced that ... \n",
|
76 |
+
"4 Pope Francis used his annual Christmas Day mes... \n"
|
77 |
+
]
|
78 |
+
}
|
79 |
+
],
|
80 |
+
"source": [
|
81 |
+
"print(\"Initial Data:\")\n",
|
82 |
+
"print(fake_df.head())"
|
83 |
+
]
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"cell_type": "code",
|
87 |
+
"execution_count": 7,
|
88 |
+
"metadata": {},
|
89 |
+
"outputs": [
|
90 |
+
{
|
91 |
+
"name": "stdout",
|
92 |
+
"output_type": "stream",
|
93 |
+
"text": [
|
94 |
+
"Data after dropping missing values:\n",
|
95 |
+
" title \\\n",
|
96 |
+
"0 Donald Trump Sends Out Embarrassing New Year’... \n",
|
97 |
+
"1 Drunk Bragging Trump Staffer Started Russian ... \n",
|
98 |
+
"2 Sheriff David Clarke Becomes An Internet Joke... \n",
|
99 |
+
"3 Trump Is So Obsessed He Even Has Obama’s Name... \n",
|
100 |
+
"4 Pope Francis Just Called Out Donald Trump Dur... \n",
|
101 |
+
"\n",
|
102 |
+
" text \n",
|
103 |
+
"0 Donald Trump just couldn t wish all Americans ... \n",
|
104 |
+
"1 House Intelligence Committee Chairman Devin Nu... \n",
|
105 |
+
"2 On Friday, it was revealed that former Milwauk... \n",
|
106 |
+
"3 On Christmas day, Donald Trump announced that ... \n",
|
107 |
+
"4 Pope Francis used his annual Christmas Day mes... \n"
|
108 |
+
]
|
109 |
+
}
|
110 |
+
],
|
111 |
+
"source": [
|
112 |
+
"# Keep only relevant columns\n",
|
113 |
+
"fake_df = fake_df[['title', 'text']].dropna()\n",
|
114 |
+
"print(\"Data after dropping missing values:\")\n",
|
115 |
+
"print(fake_df.head())\n"
|
116 |
+
]
|
117 |
+
},
|
118 |
+
{
|
119 |
+
"cell_type": "code",
|
120 |
+
"execution_count": 8,
|
121 |
+
"metadata": {},
|
122 |
+
"outputs": [],
|
123 |
+
"source": [
|
124 |
+
"# Combine title and text\n",
|
125 |
+
"fake_df['content'] = fake_df['title'] + \" \" + fake_df['text']"
|
126 |
+
]
|
127 |
+
},
|
128 |
+
{
|
129 |
+
"cell_type": "code",
|
130 |
+
"execution_count": 9,
|
131 |
+
"metadata": {},
|
132 |
+
"outputs": [],
|
133 |
+
"source": [
|
134 |
+
"# Function to clean text\n",
|
135 |
+
"def clean_text(text):\n",
|
136 |
+
" text = text.lower()\n",
|
137 |
+
" text = re.sub(f\"[{string.punctuation}]\", \"\", text)\n",
|
138 |
+
" text = re.sub(r\"\\d+\", \"\", text)\n",
|
139 |
+
" text = \" \".join([word for word in text.split() if word not in stop_words])\n",
|
140 |
+
" return text"
|
141 |
+
]
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"cell_type": "code",
|
145 |
+
"execution_count": 10,
|
146 |
+
"metadata": {},
|
147 |
+
"outputs": [
|
148 |
+
{
|
149 |
+
"name": "stdout",
|
150 |
+
"output_type": "stream",
|
151 |
+
"text": [
|
152 |
+
"Data after text cleaning:\n",
|
153 |
+
" content \\\n",
|
154 |
+
"0 Donald Trump Sends Out Embarrassing New Year’... \n",
|
155 |
+
"1 Drunk Bragging Trump Staffer Started Russian ... \n",
|
156 |
+
"2 Sheriff David Clarke Becomes An Internet Joke... \n",
|
157 |
+
"3 Trump Is So Obsessed He Even Has Obama’s Name... \n",
|
158 |
+
"4 Pope Francis Just Called Out Donald Trump Dur... \n",
|
159 |
+
"\n",
|
160 |
+
" clean_text \n",
|
161 |
+
"0 donald trump sends embarrassing new year’s eve... \n",
|
162 |
+
"1 drunk bragging trump staffer started russian c... \n",
|
163 |
+
"2 sheriff david clarke becomes internet joke thr... \n",
|
164 |
+
"3 trump obsessed even obama’s name coded website... \n",
|
165 |
+
"4 pope francis called donald trump christmas spe... \n"
|
166 |
+
]
|
167 |
+
}
|
168 |
+
],
|
169 |
+
"source": [
|
170 |
+
"# Apply text cleaning\n",
|
171 |
+
"fake_df['clean_text'] = fake_df['content'].apply(clean_text)\n",
|
172 |
+
"print(\"Data after text cleaning:\")\n",
|
173 |
+
"print(fake_df[['content', 'clean_text']].head())"
|
174 |
+
]
|
175 |
+
},
|
176 |
+
{
|
177 |
+
"cell_type": "code",
|
178 |
+
"execution_count": 11,
|
179 |
+
"metadata": {},
|
180 |
+
"outputs": [],
|
181 |
+
"source": [
|
182 |
+
"# Convert text to TF-IDF vectors\n",
|
183 |
+
"vectorizer = TfidfVectorizer(max_features=5000)\n",
|
184 |
+
"X = vectorizer.fit_transform(fake_df['clean_text'])"
|
185 |
+
]
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"cell_type": "code",
|
189 |
+
"execution_count": 12,
|
190 |
+
"metadata": {},
|
191 |
+
"outputs": [
|
192 |
+
{
|
193 |
+
"name": "stdout",
|
194 |
+
"output_type": "stream",
|
195 |
+
"text": [
|
196 |
+
"Cluster assignments:\n",
|
197 |
+
" title cluster\n",
|
198 |
+
"0 Donald Trump Sends Out Embarrassing New Year’... 2\n",
|
199 |
+
"1 Drunk Bragging Trump Staffer Started Russian ... 2\n",
|
200 |
+
"2 Sheriff David Clarke Becomes An Internet Joke... 1\n",
|
201 |
+
"3 Trump Is So Obsessed He Even Has Obama’s Name... 2\n",
|
202 |
+
"4 Pope Francis Just Called Out Donald Trump Dur... 1\n"
|
203 |
+
]
|
204 |
+
}
|
205 |
+
],
|
206 |
+
"source": [
|
207 |
+
"# Apply K-Means clustering\n",
|
208 |
+
"num_clusters = 3 # Try clustering articles into 3 groups\n",
|
209 |
+
"kmeans = KMeans(n_clusters=num_clusters, random_state=42)\n",
|
210 |
+
"fake_df['cluster'] = kmeans.fit_predict(X)\n",
|
211 |
+
"print(\"Cluster assignments:\")\n",
|
212 |
+
"print(fake_df[['title', 'cluster']].head())"
|
213 |
+
]
|
214 |
+
},
|
215 |
+
{
|
216 |
+
"cell_type": "code",
|
217 |
+
"execution_count": 13,
|
218 |
+
"metadata": {},
|
219 |
+
"outputs": [
|
220 |
+
{
|
221 |
+
"data": {
|
222 |
+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAHHCAYAAACiOWx7AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjEsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvc2/+5QAAAAlwSFlzAAAPYQAAD2EBqD+naQAANllJREFUeJzt3Qt8znX/x/HPZrY5tM1oG4XcKadETaFQsj/irpQODmV3iQ4oh5wK0WnhlkNEKqei5K6pkGiUYk5DzkulKNkqTE4zXP/H5/u4f9fjurbhyz2ua9dez8fj17Xf7/e9ftf32q6H69339AtyuVwuAQAAwBkFn/k0AAAAFKEJAADAAqEJAADAAqEJAADAAqEJAADAAqEJAADAAqEJAADAAqEJAADAAqEJAADAAqEJgJdp06ZJUFCQrF271tdVCTi33nqr2Qqjr776ynwu9BEoqghNQICFnfy2AQMGiD/517/+Zep17bXXSn53ctJz3bt3l8IiIyNDnnnmGalevbqULFlSSpUqJfHx8fLSSy/JgQMHLlo9XnnlFZk7d+5Fez2gqAnxdQUAFKwXXnhBqlSp4nXsmmuuEX+0adMm+fjjj6Vt27ZSWK1Zs0ZatWolhw4dkgcffNCEJaUtda+++qosW7ZMFi1adNFC07333itt2rQp8Gs3adJEjh49KqGhoQV+baCwIDQBAeb222+XevXqib8rUaKEVKxY0YS8e+65x7QuFTbainT33XdLsWLFZP369aalydPLL78sb731lhRmx44dM0EpODhYwsPDfV0dwKfongOKiF9++UWefPJJqVatmgksZcuWlfvuu09+/vnnsz53//79cuONN8rll18u6enp5lh2drY8//zzUrVqVQkLCzMBqF+/fua4Df0SHjRokGzcuFGSk5PPWt7m9TR8XX/99V7Pu+OOO0wg+/TTT93HVq1aZY59/vnnZj8nJ0eGDRsmV111lQkG+rtp1KiRLF68+Ix1evPNN+W3336T1157LU9gUrGxseY9nq1LNfffIL/xQzt27DAtcnFxcaaO+rdo166dZGVlmfNa/vDhwzJ9+nR3t6x2gzq0no888oipk/7+atWqJVOmTMn3dT/44ANT78suu8x0Nx48eDDfOun4LG3F3Lp1qzRt2tSU1eeMGDEi38/fnXfeabouY2JipFevXvLFF18wTgqFCi1NQIDRL9E///zT61i5cuVMN9KKFSvMF61+4eoX9cSJE80Xn37p6RdefvRa//d//yf79u2Tr7/+Wq688ko5deqU+QL89ttvpWvXrlKjRg3T1TZ69Gj5/vvvrcfVdOjQQV588UXT2qQtNqdrbbJ9vcaNG8snn3xivuQjIiLMeKnly5ebgPbNN9+Yayj9WY/dfPPNZn/o0KGSlJQkjz76qAmH+nztXlu3bp1576ejQUwDqHaJXUjHjx+XFi1amIDYo0cPE5w0BM2bN8+0dkVGRsq7777rrr/+jpT+rZwxVw0aNHCPFbv00ktNYOzcubN5rz179vR6Pf2baOuSjtPS1zxTl5wG6pYtW5rAev/998t//vMf6d+/v9SuXdu0eioNc7fddpv8/vvv8vTTT5v6z5o1S5YuXXpBf29AgXMBCAhTp07VEdX5burIkSN5npOammrOz5gxI8911qxZ4/r9999dtWrVcv3jH/9w/fzzz+4y7777ris4ONj1zTffeF1v0qRJ5rnLly8/Y10TExNdpUqVMj9Pnz7dPOfjjz92n9f9bt26nfPraZ11f8GCBWZ/48aNZv++++5z1a9f3/28O++803Xddde59+vUqeNq3bq161yVKVPGPNfWLbfcYrbcv+udO3d6lVu6dKk5ro9q/fr1Zn/OnDlnvL7+TvV3m1vnzp1d5cuXd/35559ex9u1a+eKjIx0fzac19W/d+7PS+46Oe8n9+cnOzvbFRcX52rbtq372KhRo0y5uXPnuo8dPXrUVb169TzXBPwZ3XNAgJkwYY
LpVvLclLaIOLQ76q+//jJdXVFRUaZFJbdff/1VbrnlFlNWBzNXrlzZfW7OnDmmtUe7pLQlytm0NUGdSwtCx44dTbeYtjblN5PuXF7vuuuuk9KlS5v6Oi1K2qrWqVMn8x6PHDliXkNbrLRVyqG/gy1btpgusHOhrTSXXHKJXGjakqS0O0vfw7nQ9/vRRx+Zbkr92fP3p61X2jKZ+++fmJjo9Xk5E/196wB4h7ZKaWvXTz/95D62cOFC023ntPQp7WLs0qXLOb0XwNfongMCjH5h5TcQXGc+aRfU1KlTTdeOZ0BxxsV4euihhyQkJES2bdtmulM8abjQ49rNk5/MzEzr+uogah0/o1/U2s2m3XS52b6eXqthw4YmLCl91HCk45NOnjwpK1euNGN6tKvRMzRpYLvrrrvk6quvNmN0tLtJ378uiXAm2gX4999/y4WmsyF79+5txk7NnDnT1F0DiIYVJ1Cdzh9//GG68CZPnmw2m79X7tmXZ6KhNHe3apkyZcxYNc/xTNpVmLuchnagMCE0AUWEjoXRwKTjVzRY6JetfonpGCcdM5SbjlGZMWOGjB071oQtT1pex6zol3h+dJD2udDWJmdsU37T5c/l9TQg6aw1nfWloem5554zLUkahnRfQ5PyDE06nf7HH38046F0eYC3337bjJeaNGmSGSd0OtrytWHDBjPm6Hym4p9uDJcGvNxGjRplBnY7dXzqqafM30WDoAaX03H+thqwNJjmJ3c4tG1lcoJqfk7XaggUZoQmoIjQAbr6palfvg4NFqdbfFFDlrYEDBkyxAQszwUytdXgu+++k2bNmhXIUgFOa5MTCnI7l9fTMKQh5v333zctak440mDkhCZtUXLCkyM6Oloefvhhs+maS1peB4ifKTRpl1dqaqrp/mrfvv05v29tkVG5/wbaMpMfDY666e9KB/XrQHYNdrqIpsrvd6Otc9qFqEEsISFBfEG7dnWygQYpzzr+8MMPPqkPcL4Y0wQUERpMcv/f/+uvv55vq4Zj8ODBZgbVwIEDzUw7h86S0kCS3xpE2g2os6XOlbaEaEjTqf+5ncvr1a9fX4oXLy7Dhw83QUin1isNT9oqozMAPVuZlI7vyj1OR+tytuUTHn/8cSlfvrz06dPHzOLLr9vLCTT5cWa3OWOwlP49cnej6dipEydOeB3T8KQzAD3rqNP5cwcw/bvrUgUa7DZv3pxv992FpmOn9O/nueyDBvbCvoYVih5amoAi4p///KeZlq6tRjVr1jQtJF9++aVZk+hMRo4cacY8devWzbRYaLjR8T4ffvihCQ06CFtbPPTLfvv27ea4Dlg+1wU29ctdu9K0pSe3c3k9XTpBV+XWgOSs0aS05UjDlW65Q5P+PnTpBX2eBi1dbkBb5s52KxdtKdI1pnRF8Lp163qtCK6Dq7W1S7tCT0cDnS4FoKFUx1npa+saSbkD0pIlS0xddF0tbSXT8/q3dAKRQ19b/6bajVmhQgUzNklDpK5Mrr83/VkHX+v71dfTOmp5/flCeuyxx2T8+PGmNU6XHNCgqWOznMUyC+PCpiiifD19D0DB8FwqID/79+93Pfzww65y5cq5Spcu7WrRooVr+/btrsqVK3tNU8/vOidPnnS1b9/eFRIS4p42fvz4cdfw4cPNkgRhYWFm+n18fLxr2LBhrqysLOslBzzl5OS4rrzyyjxLDpzr6/Xt29dcQ8t7qlq1qjn+448/eh1/6aWXXDfeeKMrKirKVaJECTMV/uWXXzavaWPPnj2uXr16ua6++mpXeHi4q2TJkqZueg3PuuVeckBpXRISEsx7io2NdT377LOuxYsXe03F/+mnn1yPPPKI+d3o9aOjo11NmzZ1ffnll17X0r9nkyZNzHvQ53v+XTMyMszvtGLFiq7ixYubZQGaNWvmmjx5cp5lBfJb2uB0Sw7o3yM3fV39XHnS96DLOmjdLr30UlefPn1cH330kbnmypUrrX7PgK8F6X98HdwAAEXPmD
FjzMrguryFLkkA+DtCEwDggtOxZ56z8nRMk66rpd2s+Y0HA/wRY5oAABecLmFRqVIlM/ZLx8i99957Zkyajm0CCgtCEwDggtMZdLr+lYYkbV3Sweg66P2BBx7wddUAa3TPAQAAWGCdJgAAAAuEJgAAAAuMaSogen+nPXv2mMX/WKgNAIDCQUcp6Y23dUFYXWX/TAhNBUQD07nepBQAAPiH3bt3n/Hm14rQVEC0hcn5pUdERPi6OgAAwILe21EbPZzv8TMhNBUQp0tOAxOhCQCAwsVmaA0DwQEAACz4NDQtW7bM3IVcB19pwps7d+5py+rdzbWM3qvIk96du2PHjqZ1JyoqSjp37iyHDh3yKrNx40ZzV3O9o7Y2wY0YMSLP9efMmSPVq1c3ZWrXri0LFiwowHcKAAAKO5+GpsOHD0udOnVkwoQJZyyXnJwsK1euNOEqNw1MW7ZskcWLF8u8efNMEOvatatXX2Xz5s2lcuXKkpaWJiNHjpShQ4fK5MmT3WVWrFgh7du3N4Fr/fr10qZNG7Nt3ry5gN8xAAAotFx+QquSnJyc5/ivv/7quuyyy1ybN292Va5c2TV69Gj3ua1bt5rnrVmzxn3s888/dwUFBbl+++03s//GG2+4ypQp48rOznaX6d+/v6tatWru/fvvv9/VunVrr9etX7++67HHHrOuf1ZWlqmLPgIAgMLhXL6/g/197aOHHnpI+vbtK7Vq1cpzPjU11XTJ1atXz30sISHBrLOwatUqd5kmTZpIaGio1z2Q0tPTZf/+/e4y+jxPWkaPn052drZpxfLcAABA4PLr0DR8+HAJCQmRp556Kt/ze/fulZiYGK9jWj46Otqcc8rExsZ6lXH2z1bGOZ+fpKQkiYyMdG+s0QQAQGDz29Ck44/Gjh0r06ZN88sVtgcOHChZWVnuTddnAgAAgctvQ9M333wjmZmZUqlSJdN6pNsvv/wiffr0kSuuuMKUiYuLM2U8nThxwsyo03NOmYyMDK8yzv7Zyjjn8xMWFuZek4m1mQAACHx+G5p0LJMuFbBhwwb3prPndHzTF198Yco0bNhQDhw4YFqlHEuWLDFjoerXr+8uozPqcnJy3GV0pl21atWkTJky7jIpKSler69l9DgAAIDPVwTX9ZR++OEH9/7OnTtNONIxSdrCVLZsWa/yxYsXN60/GnhUjRo1pGXLltKlSxeZNGmSCUbdu3eXdu3auZcn6NChgwwbNswsJ9C/f3+zjIB2+40ePdp93aefflpuueUWGTVqlLRu3Vo++OADWbt2rdeyBAAAoIhz+dDSpUvNNL/cW2JiYr7lcy85oP766y9X+/btXaVLl3ZFRES4Hn74Ydfff//tVea7775zNWrUyBUWFmaWL3j11VfzXPvDDz90XX311a7Q0FBXrVq1XPPnzz+n98KSAwAAFD7n8v0dpP/xdXALBLrkgM6i00HhjG8CACDwvr/9dkwTAACAPyE0AQAAWCA0AQAA+PvsOQD+L77vDF9XAX4kbWQnX1cB8BlamgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAACwQmgAAAPw9NC1btkzuuOMOqVChggQFBcncuXPd53JycqR///5Su3ZtKVWqlCnTqVMn2bNnj9c19u3bJx07dpSIiAiJioqSzp07y6FDh7zKbNy4URo3bizh4eFSsWJFGTFiRJ66zJkzR6pXr27K6GsuWLDgAr5zAABQ2Pg0NB0+fFjq1KkjEyZMyHPuyJEjsm
7dOhk8eLB5/PjjjyU9PV3uvPNOr3IamLZs2SKLFy+WefPmmSDWtWtX9/mDBw9K8+bNpXLlypKWliYjR46UoUOHyuTJk91lVqxYIe3btzeBa/369dKmTRuzbd68+QL/BgAAQGER5HK5XOIHtKUpOTnZhJXTWbNmjdx4443yyy+/SKVKlWTbtm1Ss2ZNc7xevXqmzMKFC6VVq1by66+/mtapiRMnynPPPSd79+6V0NBQU2bAgAGmVWv79u1m/4EHHjABTkOXo0GDBlK3bl2ZNGmSVf01nEVGRkpWVpZp9QICRXzfGb6uAvxI2shOvq4CUKDO5fu7UI1p0jek4Uq74VRqaqr52QlMKiEhQYKDg2XVqlXuMk2aNHEHJtWiRQvTarV//353GX2eJy2jx08nOzvb/KI9NwAAELgKTWg6duyYGeOk3WhOEtTWo5iYGK9yISEhEh0dbc45ZWJjY73KOPtnK+Ocz09SUpJJps6mY6UAAEDgKhShSQeF33///aI9idrd5g8GDhxoWr6cbffu3b6uEgAAuIBCpJAEJh3HtGTJEq/+xri4OMnMzPQqf+LECTOjTs85ZTIyMrzKOPtnK+Ocz09YWJjZAABA0RBcGALTjh075Msvv5SyZct6nW/YsKEcOHDAzIpzaLA6deqU1K9f311GZ9TptRw6065atWpSpkwZd5mUlBSva2sZPQ4AAODz0KTrKW3YsMFsaufOnebnXbt2mZBz7733ytq1a2XmzJly8uRJM8ZIt+PHj5vyNWrUkJYtW0qXLl1k9erVsnz5cunevbu0a9fOzJxTHTp0MIPAdTkBXZpg9uzZMnbsWOndu7e7Hk8//bSZdTdq1Cgzo06XJNDX1WsBAAD4fMmBr776Spo2bZrneGJiogkuVapUyfd5S5culVtvvdX8rF1xGm4+++wzM2uubdu2Mm7cOCldurTX4pbdunUzSxOUK1dOevToYQaV517cctCgQfLzzz/LVVddZRbA1KULbLHkAAIVSw7AE0sOINCcy/e336zTVNgRmhCoCE3wRGhCoAnYdZoAAAB8hdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAABggdAEAADg76Fp2bJlcscdd0iFChUkKChI5s6d63Xe5XLJkCFDpHz58lKiRAlJSEiQHTt2eJXZt2+fdOzYUSIiIiQqKko6d+4shw4d8iqzceNGady4sYSHh0vFihVlxIgReeoyZ84cqV69uilTu3ZtWbBgwQV61wAAoDDyaWg6fPiw1KlTRyZMmJDveQ0348aNk0mTJsmqVaukVKlS0qJFCzl27Ji7jAamLVu2yOLFi2XevHkmiHXt2tV9/uDBg9K8eXOpXLmypKWlyciRI2Xo0KEyefJkd5kVK1ZI+/btTeBav369tGnTxmybN2++wL8BAABQWAS5tDnHD2hLU3JysgkrSqulLVB9+vSRZ555xhzLysqS2NhYmTZtmrRr1062bdsmNWvWlDVr1ki9evVMmYULF0qrVq3k119/Nc+fOHGiPPfcc7J3714JDQ01ZQYMGGBatbZv3272H3jgARPgNHQ5GjRoIHXr1jWBzYaGs8jISFNHbfUCAkV83xm+rgL8SNrITr6uAlCgzuX722/HNO3cudMEHe2Sc+ibql+/vqSmppp9fdQuOScwKS0fHBxsWqacMk2aNHEHJqWtVenp6bJ//353Gc/Xcco4rwMAABAifkoDk9KWJU+675zTx5iYGK
/zISEhEh0d7VWmSpUqea7hnCtTpox5PNPr5Cc7O9tsnkkVAAAELr9tafJ3SUlJpuXL2XSAOQAACFx+G5ri4uLMY0ZGhtdx3XfO6WNmZqbX+RMnTpgZdZ5l8ruG52ucroxzPj8DBw40/Z/Otnv37v/h3QIAAH/nt6FJu9Q0tKSkpHh1gelYpYYNG5p9fTxw4ICZFedYsmSJnDp1yox9csrojLqcnBx3GZ1pV61aNdM155TxfB2njPM6+QkLCzMDxjw3AAAQuHwamnQ9pQ0bNpjNGfytP+/atcvMpuvZs6e89NJL8umnn8qmTZukU6dOZkacM8OuRo0a0rJlS+nSpYusXr1ali9fLt27dzcz67Sc6tChgxkErssJ6NIEs2fPlrFjx0rv3r3d9Xj66afNrLtRo0aZGXW6JMHatWvNtQAAAHw+EFyDSdOmTd37TpBJTEw0ywr069fPLAWg6y5pi1KjRo1MuNEFKB0zZ8404aZZs2Zm1lzbtm3N2k4OHW+0aNEi6datm8THx0u5cuXMgpmeaznddNNNMmvWLBk0aJA8++yzctVVV5klCa655pqL9rsAAAD+zW/WaSrsWKcJgYp1muCJdZoQaAJinSYAAAB/QmgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAACwQGgCAAAo7KHp5MmTMnjwYKlSpYqUKFFCrrzySnnxxRfF5XK5y+jPQ4YMkfLly5syCQkJsmPHDq/r7Nu3Tzp27CgRERESFRUlnTt3lkOHDnmV2bhxozRu3FjCw8OlYsWKMmLEiIv2PgEAgP/z69A0fPhwmThxoowfP162bdtm9jXMvP766+4yuj9u3DiZNGmSrFq1SkqVKiUtWrSQY8eOuctoYNqyZYssXrxY5s2bJ8uWLZOuXbu6zx88eFCaN28ulStXlrS0NBk5cqQMHTpUJk+efNHfMwAA8E8h4sdWrFghd911l7Ru3drsX3HFFfL+++/L6tWr3a1MY8aMkUGDBplyasaMGRIbGytz586Vdu3ambC1cOFCWbNmjdSrV8+U0dDVqlUr+fe//y0VKlSQmTNnyvHjx2XKlCkSGhoqtWrVkg0bNshrr73mFa4AAEDR5dctTTfddJOkpKTI999/b/a/++47+fbbb+X22283+zt37pS9e/eaLjlHZGSk1K9fX1JTU82+PmqXnBOYlJYPDg42LVNOmSZNmpjA5NDWqvT0dNm/f3++dcvOzjYtVJ4bAAAIXH7d0jRgwAATRqpXry7FihUzY5xefvll092mNDApbVnypPvOOX2MiYnxOh8SEiLR0dFeZXTcVO5rOOfKlCmTp25JSUkybNiwAn2/AADAf/l1S9OHH35ous5mzZol69atk+nTp5suNX30tYEDB0pWVpZ72717t6+rBAAAimpLU9++fU1rk45NUrVr15ZffvnFtPIkJiZKXFycOZ6RkWFmzzl0v27duuZnLZOZmel13RMnTpgZdc7z9VGf48nZd8rkFhYWZjYAAFA0+HVL05EjR8zYI0/aTXfq1Cnzs3apaajRcU8O7c7TsUoNGzY0+/p44MABMyvOsWTJEnMNHfvklNEZdTk5Oe4yOtOuWrVq+XbNAQCAosevQ9Mdd9xhxjDNnz9ffv75Z0lOTjYz2u6++25zPigoSHr27CkvvfSSfPrpp7Jp0ybp1KmTmRHXpk0bU6ZGjRrSsmVL6dKli5l1t3z5cunevbtpvdJyqkOHDm
YQuK7fpEsTzJ49W8aOHSu9e/f26fsHAACFPDTddtttpvUmN23l0XMFRZcGuPfee+XJJ5804eeZZ56Rxx57zCxw6ejXr5/06NHDLA1www03mEUrdYkBXaTSoeOidDB5s2bNzFIDjRo18lqDSWfcLVq0yMzGi4+Plz59+pgFM1luAAAAOIJcnstrW9Ius/xmpenYocsuu8yrm6uo0MCo4UsHhevK40CgiO87w9dVgB9JG9nJ11UAfPb9fU4DwfVWI46tW7e6p+wrXQ5AW3g0NAEAAASacwpNOiNNxxHpll83nN77zfMWJwAAAEUyNOmYH+3N+8c//mEGVV966aXuczqQWrvrdHYbAABAkQ5NekNb5Uz5BwAAKCrOe3HLHTt2yNKlS83g79whSmeeAQAASFEPTW+99ZY88cQTUq5cObO4pI5xcujPhCYAABBozis06WKSuuhk//79C75GAAAAgbK45f79++W+++4r+NoAAAAEUmjSwKQraAMAABQV59U9V7VqVRk8eLCsXLlSateuLcWLF/c6/9RTTxVU/QAAAApvaNL7tpUuXVq+/vprs3nSgeCEJgAAEGjOKzTpIpcAAABFyXmv0wQAgC9wE2n46kbS5xWaHnnkkTOenzJlyvnWBwAAwC+FnO+SA55ycnJk8+bNcuDAgXxv5AsAAFAkQ1NycnKeY3orFV0l/MorryyIegEAABT+dZryvVBwsPTu3VtGjx5dUJcEAAAIvNCkfvzxRzlx4kRBXhIAAKDwds9pi5Inl8slv//+u8yfP18SExMLqm4AAACFOzStX78+T9fcpZdeKqNGjTrrzDoAAIAiE5qWLl1a8DUBAAAI1MUt//jjD0lPTzc/V6tWzbQ2AQAABKLzGgh++PBh0w1Xvnx5adKkidkqVKggnTt3liNHjhR8LQEAAApjaNKB4Hqj3s8++8wsaKnbJ598Yo716dOn4GsJAABQGLvnPvroI/nPf/4jt956q/tYq1atpESJEnL//ffLxIkTC7KOAAAAhbOlSbvgYmNj8xyPiYmhew4AAASk8wpNDRs2lOeff16OHTvmPnb06FEZNmyYOQcAABBozqt7bsyYMdKyZUu5/PLLpU6dOubYd999J2FhYbJo0aKCriMAAEDhDE21a9eWHTt2yMyZM2X79u3mWPv27aVjx45mXBMAAECgOa/QlJSUZMY0denSxev4lClTzNpN/fv3L6j6AQAAFN4xTW+++aZUr149z/FatWrJpEmTCqJeAAAAhT807d271yxsmZuuCK437gUAAAg05xWaKlasKMuXL89zXI/pyuAAAACB5rzGNOlYpp49e0pOTo7cdttt5lhKSor069ePFcEBAEBAOq/Q1LdvX/nrr7/kySeflOPHj5tj4eHhZgD4wIEDC7qOAAAAhTM0BQUFyfDhw2Xw4MGybds2s8zAVVddZdZpAgAACETnFZocpUuXlhtuuKHgagMAABBIA8EBAACKGkITAACABUITAACABUITAABAIISm3377TR588EEpW7asmaWnNwteu3at+7zL5ZIhQ4aYFcr1fEJCgrmZsKd9+/aZmwlHRERIVFSUdO7cWQ4dOuRVZuPGjdK4cWOzdIIu3jlixIiL9h4BAID/8+vQtH//frn55pulePHi8vnnn8vWrVtl1KhRUqZMGXcZDTfjxo0z97xbtWqVlCpVSlq0aCHHjh1zl9HAtGXLFlm8eLHMmzdPli1bJl27dnWfP3jwoDRv3lwqV64saWlpMnLkSBk6dKhMnjz5or9nAAAQgEsOXGi6FpS2+kydOtV9rEqVKl6tTGPGjJFBgwbJXXfdZY7NmDFDYmNjZe7cudKuXTuzjtTChQtlzZo1Uq9ePVPm9ddfl1atWsm///1vc9uXmTNnmkU6p0yZIqGhoebGwxs2bJDXXnvNK1wBAICiy69bmj799FMTdO677z6JiYmR6667Tt566y33+Z07d5qbB2uXnCMyMlLq168vqampZl8ftUvOCU
xKywcHB5uWKadMkyZNTGByaGtVenq6ae3KT3Z2tmmh8twAAEDg8uvQ9NNPP8nEiRPNauNffPGFPPHEE/LUU0/J9OnTzXkNTEpbljzpvnNOHzVweQoJCZHo6GivMvldw/M1cktKSjIBzdm0RQwAAAQuvw5Np06dkuuvv15eeeUV08qkXWV6s2Adv+Rreo+9rKws97Z7925fVwkAABTV0KQz4mrWrOl1rEaNGrJr1y7zc1xcnHnMyMjwKqP7zjl9zMzM9Dp/4sQJM6POs0x+1/B8jdz0Pns6G89zAwAAgcuvQ5POnNNxRZ6+//57M8vNGRSuoSYlJcV9XscW6Vilhg0bmn19PHDggJkV51iyZIlpxdKxT04ZnVGXk5PjLqMz7apVq+Y1Uw8AABRdfh2aevXqJStXrjTdcz/88IPMmjXLLAPQrVs3cz4oKEh69uwpL730khk0vmnTJunUqZOZEdemTRt3y1TLli1Nt97q1atl+fLl0r17dzOzTsupDh06mEHgun6TLk0we/ZsGTt2rPTu3dun7x8AAPgPv15y4IYbbpDk5GQzfuiFF14wLUu6xICuu+To16+fHD582Ix30halRo0amSUGdJFKhy4poEGpWbNmZtZc27ZtzdpODh3IvWjRIhPG4uPjpVy5cmbBTJYbAAAAjiCXLnaE/5l2C2r40kHhjG9CIInvO8PXVYAfSRvZyddV4DOJAv1cnsv3t193zwEAAPgLQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAIAFQhMAAECghaZXX31VgoKCpGfPnu5jx44dk27duknZsmWldOnS0rZtW8nIyPB63q5du6R169ZSsmRJiYmJkb59+8qJEye8ynz11Vdy/fXXS1hYmFStWlWmTZt20d4XAADwf4UmNK1Zs0befPNNufbaa72O9+rVSz777DOZM2eOfP3117Jnzx6555573OdPnjxpAtPx48dlxYoVMn36dBOIhgwZ4i6zc+dOU6Zp06ayYcMGE8oeffRR+eKLLy7qewQAAP6rUISmQ4cOSceOHeWtt96SMmXKuI9nZWXJO++8I6+99prcdtttEh8fL1OnTjXhaOXKlabMokWLZOvWrfLee+9J3bp15fbbb5cXX3xRJkyYYIKUmjRpklSpUkVGjRolNWrUkO7du8u9994ro0eP9tl7BgAA/qVQhCbtftOWoISEBK/jaWlpkpOT43W8evXqUqlSJUlNTTX7+li7dm2JjY11l2nRooUcPHhQtmzZ4i6T+9paxrlGfrKzs801PDcAABC4QsTPffDBB7Ju3TrTPZfb3r17JTQ0VKKioryOa0DSc04Zz8DknHfOnamMBqGjR49KiRIl8rx2UlKSDBs2rADeIQAAKAz8uqVp9+7d8vTTT8vMmTMlPDxc/MnAgQNN96CzaV0BAEDg8uvQpN1vmZmZZlZbSEiI2XSw97hx48zP2hqk45IOHDjg9TydPRcXF2d+1sfcs+mc/bOViYiIyLeVSeksOz3vuQEAgMDl16GpWbNmsmnTJjOjzdnq1atnBoU7PxcvXlxSUlLcz0lPTzdLDDRs2NDs66NeQ8OXY/HixSbk1KxZ013G8xpOGecaAAAAfj2m6ZJLLpFrrrnG61ipUqXMmkzO8c6dO0vv3r0lOjraBKEePXqYsNOgQQNzvnnz5iYcPfTQQzJixAgzfmnQoEFmcLm2FqnHH39cxo8fL/369ZNHHnlElixZIh9++K
HMnz/fB+8aAAD4I78OTTZ0WYDg4GCzqKXOaNNZb2+88Yb7fLFixWTevHnyxBNPmDCloSsxMVFeeOEFdxldbkADkq75NHbsWLn88svl7bffNtcCAABQQS6Xy8Wv4n+nM+0iIyPNoHDGNyGQxPed4esqwI+kjezk6yrwmUSBfi7P5fvbr8c0AQAA+AtCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgAVCEwAAgIUQm0K4eOL7zvB1FeBH0kZ28nUVAAD/RUsTAACABUITAACABUITAACABUITAACABUITAACABUITAACABUITAACABUITAACABUITAACABUITAABAYQ9NSUlJcsMNN8gll1wiMTEx0qZNG0lPT/cqc+zYMenWrZuULVtWSpcuLW3btpWMjAyvMrt27ZLWrVtLyZIlzXX69u0rJ06c8Crz1VdfyfXXXy9hYWFStWpVmTZt2kV5jwAAoHDw69D09ddfm0C0cuVKWbx4seTk5Ejz5s3l8OHD7jK9evWSzz77TObMmWPK79mzR+655x73+ZMnT5rAdPz4cVmxYoVMnz7dBKIhQ4a4y+zcudOUadq0qWzYsEF69uwpjz76qHzxxRcX/T0DAAD/5Nc37F24cKHXvoYdbSlKS0uTJk2aSFZWlrzzzjsya9Ysue2220yZqVOnSo0aNUzQatCggSxatEi2bt0qX375pcTGxkrdunXlxRdflP79+8vQoUMlNDRUJk2aJFWqVJFRo0aZa+jzv/32Wxk9erS0aNHCJ+8dAAD4F79uacpNQ5KKjo42jxqetPUpISHBXaZ69epSqVIlSU1NNfv6WLt2bROYHBqEDh48KFu2bHGX8byGU8a5Rn6ys7PNNTw3AAAQuApNaDp16pTpNrv55pvlmmuuMcf27t1rWoqioqK8ympA0nNOGc/A5Jx3zp2pjAaho0ePnna8VWRkpHurWLFiAb5bAADgbwpNaNKxTZs3b5YPPvhA/MHAgQNNy5ez7d6929dVAgAARXVMk6N79+4yb948WbZsmVx++eXu43FxcWaA94EDB7xam3T2nJ5zyqxevdrres7sOs8yuWfc6X5ERISUKFEi3zrpLDvdAABA0eDXLU0ul8sEpuTkZFmyZIkZrO0pPj5eihcvLikpKe5juiSBLjHQsGFDs6+PmzZtkszMTHcZnYmngahmzZruMp7XcMo41wAAAAjx9y45nRn3ySefmLWanDFIOoZIW4D0sXPnztK7d28zOFyDUI8ePUzY0ZlzSpco0HD00EMPyYgRI8w1Bg0aZK7ttBQ9/vjjMn78eOnXr5888sgjJqB9+OGHMn/+fJ++fwAA4D/8uqVp4sSJZrzQrbfeKuXLl3dvs2fPdpfRZQH++c9/mkUtdRkC7Wr7+OOP3eeLFStmuvb0UcPUgw8+KJ06dZIXXnjBXUZbsDQgaetSnTp1zNIDb7/9NssNAACAwtHSpN1zZxMeHi4TJkww2+lUrlxZFixYcMbraDBbv379edUTAAAEPr9uaQIAAPAXhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhCYAAAALhC
YAAAALhCYAAAALhCYAAAALhKZcJkyYIFdccYWEh4dL/fr1ZfXq1b6uEgAA8AOEJg+zZ8+W3r17y/PPPy/r1q2TOnXqSIsWLSQzM9PXVQMAAD5GaPLw2muvSZcuXeThhx+WmjVryqRJk6RkyZIyZcoUX1cNAAD4GKHpv44fPy5paWmSkJDgPhYcHGz2U1NTfVo3AADgeyG+roC/+PPPP+XkyZMSGxvrdVz3t2/fnqd8dna22RxZWVnm8eDBg/9TPU5mH/2fno/A8r9+ngoCn0l44jOJQPtcOs91uVxnLUtoOk9JSUkybNiwPMcrVqzok/ogMEW+/rivqwB44TOJQP1c/v333xIZGXnGMoSm/ypXrpwUK1ZMMjIyvI7rflxcXJ7yAwcONIPGHadOnZJ9+/ZJ2bJlJSgo6KLUOVBp6tfwuXv3bomIiPB1dQA+k/A7fCYLjrYwaWCqUKHCWcsSmv4rNDRU4uPjJSUlRdq0aeMOQrrfvXv3POXDwsLM5ikqKuqi1bco0H8I+McA/oTPJPwNn8mCcbYWJgehyYO2HCUmJkq9evXkxhtvlDFjxsjhw4fNbDoAAFC0EZo8PPDAA/LHH3/IkCFDZO/evVK3bl1ZuHBhnsHhAACg6CE05aJdcfl1x+Hi0W5PXWA0d/cn4Ct8JuFv+Ez6RpDLZo4dAABAEcfilgAAABYITQAAABYITQAAABYITQAAABYITfArEyZMkCuuuELCw8Olfv36snr1al9XCUXYsmXL5I477jArBetK/3PnzvV1lVDE6S28brjhBrnkkkskJibGLMacnp7u62oVGYQm+I3Zs2ebBUZ1Gu26deukTp060qJFC8nMzPR11VBE6eK2+jnUMA/4g6+//lq6desmK1eulMWLF0tOTo40b97cfFZx4bHkAPyGtizp/0GNHz/efRsbvbdSjx49ZMCAAb6uHoo4bWlKTk5232YJ8Ae6ILO2OGmYatKkia+rE/BoaYJfOH78uKSlpUlCQoL7WHBwsNlPTU31ad0AwF9lZWWZx+joaF9XpUggNMEv/Pnnn3Ly5Mk8t6zRfb2lDQDAm7bG9+zZU26++Wa55pprfF2dIoHbqAAAUAjp2KbNmzfLt99+6+uqFBmEJviFcuXKSbFixSQjI8PruO7HxcX5rF4A4I/0Hqnz5s0zMzwvv/xyX1enyKB7Dn4hNDRU4uPjJSUlxavpWfcbNmzo07oBgL/QuVsamHRSwpIlS6RKlSq+rlKRQksT/IYuN5CYmCj16tWTG2+8UcaMGWOm0T788MO+rhqKqEOHDskPP/zg3t+5c6ds2LDBDLqtVKmST+uGotslN2vWLPnkk0/MWk3OmM/IyEgpUaKEr6sX8FhyAH5FlxsYOXKk+Yegbt26Mm7cOLMUAeALX331lTRt2jTPcQ3306ZN80mdULTp0hf5mTp1qvzrX/+66PUpaghNAAAAFhjTBAAAYIHQBAAAYIHQBAAAYIHQBAAAYIHQBAAAYIHQBAAAYIHQBAAAYIHQBKBI+Pnnn83CgLqiNwCcD0ITAJwHXRE8KirK19UAcBERmgDAh06ePGluTg3A/xGaAAQUDSAjRoyQqlWrSlhYmLmx7ssvv2zVUjR37lyve3t999135t5zemPUiIgIiY+Pl7Vr15p70umNpLOyskx53YYOHWqek52dLc8884xcdtllUqpUKXPvRC2f+3U//fRTqVmzpqnjrl27LujvBEDBCCmg6wCAXxg4cKC89dZbMnr0aGnUqJH8/vvvsn379vO6VseOHeW6666TiRMnSrFixcx4qOLFi8tNN90kY8aMkSFDhkh6eropW7p0afPYvXt32bp1q3zwwQdSoUIFSU5OlpYtW8qmTZvkqquuMmWOHDkiw4cPl7ffflvKli0rMTExBfgbAHChEJoABIy///5bxo4dK+PHj5fExERz7MorrzThSQeCnyttAerbt69Ur17d7DuhR0VGRpoWpri4OK/yerd5fdTApLTVaeHChe
b4K6+8Yo7l5OTIG2+8IXXq1Pmf3zOAi4fQBCBgbNu2zXSPNWvWrECu17t3b3n00Ufl3XfflYSEBLnvvvtMCDsdbU3SMUpXX32113Gtk7YoOUJDQ+Xaa68tkDoCuHgITQACRokSJazLBgcHi8vl8jqmLUCedJxShw4dZP78+fL555/L888/b7rd7r777nyveejQIdONl5aWZh49Od13Tj09x04BKBwYCA4gYGj3mQaSlJSUs5a99NJLTXfe4cOH3cfyW8NJW4169eolixYtknvuucd0szmtRdqq5EnHP+mxzMxMMxDdc/PsxgNQOBGaAASM8PBw6d+/v/Tr109mzJghP/74o6xcuVLeeeedPGV1VlvJkiXl2WefNeVmzZplZrY5jh49agZ168y3X375RZYvXy5r1qyRGjVqmPNXXHGFaVnSgPbnn3+awd0asHTweKdOneTjjz+WnTt3yurVqyUpKcm0VgEo3AhNAALK4MGDpU+fPmZmmwacBx54wLT85BYdHS3vvfeeLFiwQGrXri3vv/++e9kApd1rf/31lwlAGobuv/9+uf3222XYsGHmvM6ge/zxx831tdVKlzlQ2hKlz9E6VKtWTdq0aWPCli59AKBwC3Ll7tQHAABAHrQ0AQAAWCA0AQAAWCA0AQAAWCA0AQAAWCA0AQAAWCA0AQAAWCA0AQAAWCA0AQAAWCA0AQAAWCA0AQAAWCA0AQAAWCA0AQAAyNn9PwIZ7a4pM7oeAAAAAElFTkSuQmCC",
|
223 |
+
"text/plain": [
|
224 |
+
"<Figure size 640x480 with 1 Axes>"
|
225 |
+
]
|
226 |
+
},
|
227 |
+
"metadata": {},
|
228 |
+
"output_type": "display_data"
|
229 |
+
}
|
230 |
+
],
|
231 |
+
"source": [
|
232 |
+
"# Visualizing the clusters\n",
|
233 |
+
"sns.countplot(x=fake_df['cluster'])\n",
|
234 |
+
"plt.title(\"Fake News Clustering\")\n",
|
235 |
+
"plt.show()"
|
236 |
+
]
|
237 |
+
},
|
238 |
+
{
|
239 |
+
"cell_type": "code",
|
240 |
+
"execution_count": 17,
|
241 |
+
"metadata": {},
|
242 |
+
"outputs": [],
|
243 |
+
"source": [
|
244 |
+
"# Apply LDA for topic modeling\n",
|
245 |
+
"num_topics = 5\n",
|
246 |
+
"lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)\n",
|
247 |
+
"topic_matrix = lda.fit_transform(X)\n"
|
248 |
+
]
|
249 |
+
},
|
250 |
+
{
|
251 |
+
"cell_type": "code",
|
252 |
+
"execution_count": 18,
|
253 |
+
"metadata": {},
|
254 |
+
"outputs": [
|
255 |
+
{
|
256 |
+
"name": "stdout",
|
257 |
+
"output_type": "stream",
|
258 |
+
"text": [
|
259 |
+
"Topic 0: republican said vote state president people republicans would obama trump\n",
|
260 |
+
"Topic 1: students us school gun people video muslim said black police\n",
|
261 |
+
"Topic 2: one said like people clinton president video donald hillary trump\n",
|
262 |
+
"Topic 3: judge maxine jeanine nancy bundy flint waters video moore pelosi\n",
|
263 |
+
"Topic 4: investigation intelligence comey us hillary russian fbi russia clinton trump\n"
|
264 |
+
]
|
265 |
+
}
|
266 |
+
],
|
267 |
+
"source": [
|
268 |
+
"# Show top words for each topic\n",
|
269 |
+
"words = np.array(vectorizer.get_feature_names_out())\n",
|
270 |
+
"top_words = []\n",
|
271 |
+
"for topic_idx, topic in enumerate(lda.components_):\n",
|
272 |
+
" top_words.append(\" \".join(words[np.argsort(topic)][-10:]))\n",
|
273 |
+
" print(f\"Topic {topic_idx}: {top_words[-1]}\")"
|
274 |
+
]
|
275 |
+
},
|
276 |
+
{
|
277 |
+
"cell_type": "code",
|
278 |
+
"execution_count": 21,
|
279 |
+
"metadata": {},
|
280 |
+
"outputs": [
|
281 |
+
{
|
282 |
+
"data": {
|
283 |
+
"text/plain": [
|
284 |
+
"['tfidf_vectorizer.pkl']"
|
285 |
+
]
|
286 |
+
},
|
287 |
+
"execution_count": 21,
|
288 |
+
"metadata": {},
|
289 |
+
"output_type": "execute_result"
|
290 |
+
}
|
291 |
+
],
|
292 |
+
"source": [
|
293 |
+
"# Save model and vectorizer\n",
|
294 |
+
"joblib.dump(kmeans, \"kmeans_fake_news.pkl\")\n",
|
295 |
+
"joblib.dump(lda, \"lda_fake_news.pkl\")\n",
|
296 |
+
"joblib.dump(vectorizer, \"tfidf_vectorizer.pkl\")\n"
|
297 |
+
]
|
298 |
+
}
|
299 |
+
],
|
300 |
+
"metadata": {
|
301 |
+
"kernelspec": {
|
302 |
+
"display_name": "Python 3",
|
303 |
+
"language": "python",
|
304 |
+
"name": "python3"
|
305 |
+
},
|
306 |
+
"language_info": {
|
307 |
+
"codemirror_mode": {
|
308 |
+
"name": "ipython",
|
309 |
+
"version": 3
|
310 |
+
},
|
311 |
+
"file_extension": ".py",
|
312 |
+
"mimetype": "text/x-python",
|
313 |
+
"name": "python",
|
314 |
+
"nbconvert_exporter": "python",
|
315 |
+
"pygments_lexer": "ipython3",
|
316 |
+
"version": "3.11.9"
|
317 |
+
}
|
318 |
+
},
|
319 |
+
"nbformat": 4,
|
320 |
+
"nbformat_minor": 2
|
321 |
+
}
|