Updated assessment and solutions
- notebooks/assesment.ipynb +31 -15
- notebooks/solutions.ipynb +16 -1
notebooks/assesment.ipynb
CHANGED
@@ -87,8 +87,8 @@
 "# spark_df = ?\n",
 "\n",
 "# Check schema and row count\n",
-"# spark_df.
-"# print(\"spark_df count:\", spark_df.
+"# spark_df. ...\n",
+"# print(\"spark_df count:\", spark_df. ...)\n"
 ]
 },
 {
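For reference, the placeholders in this hunk could be filled in roughly as below. This is only a sketch (the assessment deliberately leaves the answers as `?`) and it assumes the `spark_df` DataFrame loaded from the Titanic CSV earlier in the notebook:

    # Inspect the inferred schema and count the rows of the Spark DataFrame
    # (assumes spark_df was created in a previous cell)
    spark_df.printSchema()
    print("spark_df count:", spark_df.count())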
@@ -99,19 +99,19 @@
 "source": [
 "# 3. Split Data into Two Subsets for Merging/Joining\n",
 "# ==================================================\n",
-"#
-"#
+"# Split the dataset into two df's by column, then merge them \n",
+"# back together\n",
 "# df_part1: subset of columns -> PassengerId, Name, Sex, Age\n",
 "# df_part2: subset of columns -> PassengerId, Fare, Survived, Pclass\n",
 "#\n",
-"#
+"# \n",
 "\n",
 "# 3.1 Pandas Split\n",
 "# ----------------\n",
 "\n",
 "# TODO: Create two new DataFrames from pd_df:\n",
 "# pd_part1 = pd_df[[\"PassengerId\", \"Name\", \"Sex\", \"Age\"]]\n",
-"# pd_part2 = pd_df[
+"# pd_part2 = pd_df[...]\n",
 "\n",
 "# pd_part1 = ?\n",
 "# pd_part2 = ?\n",
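One way the pandas split could look, based on the column lists named in the cell's comments (a sketch; the committed cell leaves both assignments as `?`):

    # Column-wise split of the pandas DataFrame into two subsets
    # that share only the PassengerId key
    pd_part1 = pd_df[["PassengerId", "Name", "Sex", "Age"]]
    pd_part2 = pd_df[["PassengerId", "Fare", "Survived", "Pclass"]]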
@@ -129,8 +129,8 @@
 "# 3.2 Spark Split\n",
 "# ---------------\n",
 "# TODO: Create two new DataFrames from spark_df:\n",
-"# spark_part1 = spark_df.
-"# spark_part2 = spark_df.
+"# spark_part1 = spark_df. ...\n",
+"# spark_part2 = spark_df. ...\n",
 "\n",
 "# spark_part1 = ?\n",
 "# spark_part2 = ?\n",
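The Spark counterpart could use `select` (a sketch, assuming the same column subsets as the pandas split):

    # Select the same column subsets from the Spark DataFrame
    spark_part1 = spark_df.select("PassengerId", "Name", "Sex", "Age")
    spark_part2 = spark_df.select("PassengerId", "Fare", "Survived", "Pclass")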
@@ -153,7 +153,6 @@
 "# TODO: Merge pd_part1 and pd_part2 on \"PassengerId\"\n",
 "# We'll call the merged DataFrame \"pd_merged\".\n",
 "#\n",
-"# pd_merged = pd_part1.merge(pd_part2, on=\"PassengerId\", how=\"inner\")\n",
 "\n",
 "# pd_merged = ?\n",
 "# print(\"pd_merged shape:\", pd_merged.shape)\n",
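The line removed in this hunk appears to be the intended pattern; spelled out for reference:

    # Inner-merge the two pandas subsets back together on the shared key
    pd_merged = pd_part1.merge(pd_part2, on="PassengerId", how="inner")
    print("pd_merged shape:", pd_merged.shape)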
@@ -171,8 +170,9 @@
 "# TODO: Join spark_part1 with spark_part2 on \"PassengerId\"\n",
 "# We'll call the joined DataFrame \"spark_merged\".\n",
 "#\n",
-"# spark_merged = spark_part1.join(spark_part2, on=\"PassengerId\", how=\"inner\")\n",
 "\n",
+"\n",
+"#Uncomment below\n",
 "# spark_merged = ?\n",
 "# print(\"spark_merged count:\", spark_merged.count())\n",
 "# spark_merged.show(5)\n",
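As with the pandas cell, the removed line shows the intended join; for reference:

    # Inner-join the two Spark subsets on PassengerId
    spark_merged = spark_part1.join(spark_part2, on="PassengerId", how="inner")
    print("spark_merged count:", spark_merged.count())
    spark_merged.show(5)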
@@ -242,7 +242,7 @@
 "outputs": [],
 "source": [
 "# 6.2 TODO: Spark - Survival rate by Sex and Pclass\n",
-"#
+"# Average survival rate by Sex and Pclass\n",
 "#\n",
 "# spark_survival_rate = ?\n",
 "# spark_survival_rate.show()\n"
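A sketch of how the 6.2 placeholder could be answered. It assumes the cleaned, joined DataFrame `spark_merged_clean` from the preceding cleaning step; averaging the 0/1 `Survived` column per group gives the rate:

    from pyspark.sql import functions as F

    # Average of the 0/1 Survived flag per (Sex, Pclass) group = survival rate
    # (spark_merged_clean is assumed to exist from the cleaning section)
    spark_survival_rate = (
        spark_merged_clean
        .groupBy("Sex", "Pclass")
        .agg(F.avg("Survived").alias("survival_rate"))
    )
    spark_survival_rate.show()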
@@ -261,11 +261,20 @@
 "# 7.1 TODO: Write spark_merged_clean to Parquet\n",
 "# e.g., spark_merged_clean.write. ...\n",
 "\n",
+"\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
 "# 7.2 TODO: Read it back into a new Spark DataFrame called 'spark_parquet_df'\n",
 "# spark_parquet_df = ?\n",
 "\n",
 "# print(\"spark_parquet_df count:\", spark_parquet_df.count())\n",
-"# spark_parquet_df.show(5)
+"# spark_parquet_df.show(5)"
 ]
 },
 {
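The 7.1/7.2 placeholders could be completed roughly as follows. This is a sketch only, and the output path `titanic_merged.parquet` is an illustrative choice, not something the notebook specifies:

    # Write the cleaned, joined DataFrame out as Parquet (path is hypothetical)
    spark_merged_clean.write.mode("overwrite").parquet("titanic_merged.parquet")

    # Read it back into a new DataFrame
    spark_parquet_df = spark.read.parquet("titanic_merged.parquet")
    print("spark_parquet_df count:", spark_parquet_df.count())
    spark_parquet_df.show(5)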
@@ -274,14 +283,21 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# 8.
+"# 8. Create a Temp View and Query\n",
 "# ========================================\n",
 "# 8.1 TODO: Create a temp view with 'spark_merged_clean' (e.g. \"titanic_merged\")\n",
 "# spark_merged_clean.createOrReplaceTempView(\"titanic_merged\")\n",
 "\n",
-"# 8.2 TODO: Spark SQL query
+"# 8.2 TODO: Spark SQL query examples\n",
+"\n",
+"#Get the average passenger age grouped by PClass\n",
 "# result_df = spark.sql(\"SELECT ... FROM titanic_merged GROUP BY ...\")\n",
-"# result_df.show()\n"
+"# result_df.show()\n",
+"\n",
+"# Calculate the Pearson correlation between passenger Fare and Survival\n",
+"# using either SQL or another method\n",
+"# Corr.(X, Y) = cov(X,Y)/(std(X)*std(Y))\n",
+"# corr = ..."
 ]
 },
 {
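For the new 8.2 prompt, the average-age query could look like the sketch below (the correlation part of the prompt is answered in the solutions diff further down). The temp view name `titanic_merged` comes from the 8.1 hint; the exact SELECT clause is an assumption:

    # Register the cleaned DataFrame and query it with Spark SQL
    spark_merged_clean.createOrReplaceTempView("titanic_merged")

    # Average passenger age per Pclass
    result_df = spark.sql("""
        SELECT Pclass, AVG(Age) AS avg_age
        FROM titanic_merged
        GROUP BY Pclass
        ORDER BY Pclass
    """)
    result_df.show()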
notebooks/solutions.ipynb
CHANGED
@@ -256,7 +256,22 @@
 " GROUP BY Pclass\n",
 " ORDER BY Pclass\n",
 " \"\"\")\n",
-"result_df.show()\n"
+"result_df.show()\n",
+"\n",
+"#Correlation between Fare and Survival\n",
+"# Compute the Pearson correlation between Fare and Survived\n",
+"\n",
+"correlation1 = spark_merged_clean.stat.corr(\"Fare\", \"Survived\", \"pearson\")\n",
+"\n",
+"print(\"Pearson correlation between Fare and Survived:\", correlation1)\n",
+"\n",
+"correlation2 = spark.sql(\n",
+" '''\n",
+" SELECT\n",
+" covar_samp(Fare, Survived) / (stddev_samp(Fare)*stddev_samp(Survived)) as correlation\n",
+" FROM titanic_merged_clean\n",
+" '''\n",
+")\n"
 ]
 },
 {
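Note that the added solution builds `correlation2` but never displays it, and its SQL reads from a `titanic_merged_clean` view that the notebook presumably registers elsewhere. A small follow-up sketch for extracting the value (the `correlation` column name comes from the alias in the query above):

    # Pull the single correlation value out of the SQL result and compare it
    # with the DataFrame-API figure; the two should agree up to rounding
    sql_corr = correlation2.first()["correlation"]
    print("SQL-derived correlation:", sql_corr)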