deagar committed on
Commit 8e5d803 · 1 Parent(s): c02076c

Updated assessment and solutions

notebooks/assesment.ipynb CHANGED
@@ -87,8 +87,8 @@
87
  "# spark_df = ?\n",
88
  "\n",
89
  "# Check schema and row count\n",
90
- "# spark_df.printSchema()\n",
91
- "# print(\"spark_df count:\", spark_df.count())\n"
92
  ]
93
  },
94
  {
@@ -99,19 +99,19 @@
99
  "source": [
100
  "# 3. Split Data into Two Subsets for Merging/Joining\n",
101
  "# ==================================================\n",
102
- "# Instead of using a second CSV, we'll simulate it by splitting the original dataset\n",
103
- "# into two DataFrames:\n",
104
  "# df_part1: subset of columns -> PassengerId, Name, Sex, Age\n",
105
  "# df_part2: subset of columns -> PassengerId, Fare, Survived, Pclass\n",
106
  "#\n",
107
- "# We then merge these two separate DataFrames in both Pandas and Spark.\n",
108
  "\n",
109
  "# 3.1 Pandas Split\n",
110
  "# ----------------\n",
111
  "\n",
112
  "# TODO: Create two new DataFrames from pd_df:\n",
113
  "# pd_part1 = pd_df[[\"PassengerId\", \"Name\", \"Sex\", \"Age\"]]\n",
114
- "# pd_part2 = pd_df[[\"PassengerId\", \"Fare\", \"Survived\", \"Pclass\"]]\n",
115
  "\n",
116
  "# pd_part1 = ?\n",
117
  "# pd_part2 = ?\n",
@@ -129,8 +129,8 @@
129
  "# 3.2 Spark Split\n",
130
  "# ---------------\n",
131
  "# TODO: Create two new DataFrames from spark_df:\n",
132
- "# spark_part1 = spark_df.select(\"PassengerId\", \"Name\", \"Sex\", \"Age\")\n",
133
- "# spark_part2 = spark_df.select(\"PassengerId\", \"Fare\", \"Survived\", \"Pclass\")\n",
134
  "\n",
135
  "# spark_part1 = ?\n",
136
  "# spark_part2 = ?\n",
@@ -153,7 +153,6 @@
153
  "# TODO: Merge pd_part1 and pd_part2 on \"PassengerId\"\n",
154
  "# We'll call the merged DataFrame \"pd_merged\".\n",
155
  "#\n",
156
- "# pd_merged = pd_part1.merge(pd_part2, on=\"PassengerId\", how=\"inner\")\n",
157
  "\n",
158
  "# pd_merged = ?\n",
159
  "# print(\"pd_merged shape:\", pd_merged.shape)\n",
@@ -171,8 +170,9 @@
171
  "# TODO: Join spark_part1 with spark_part2 on \"PassengerId\"\n",
172
  "# We'll call the joined DataFrame \"spark_merged\".\n",
173
  "#\n",
174
- "# spark_merged = spark_part1.join(spark_part2, on=\"PassengerId\", how=\"inner\")\n",
175
  "\n",
176
  "# spark_merged = ?\n",
177
  "# print(\"spark_merged count:\", spark_merged.count())\n",
178
  "# spark_merged.show(5)\n",
@@ -242,7 +242,7 @@
242
  "outputs": [],
243
  "source": [
244
  "# 6.2 TODO: Spark - Survival rate by Sex and Pclass\n",
245
- "# e.g. groupBy(\"Sex\", \"Pclass\").agg(F.avg(\"Survived\"))\n",
246
  "#\n",
247
  "# spark_survival_rate = ?\n",
248
  "# spark_survival_rate.show()\n"
@@ -261,11 +261,20 @@
261
  "# 7.1 TODO: Write spark_merged_clean to Parquet\n",
262
  "# e.g., spark_merged_clean.write. ...\n",
263
  "\n",
264
  "# 7.2 TODO: Read it back into a new Spark DataFrame called 'spark_parquet_df'\n",
265
  "# spark_parquet_df = ?\n",
266
  "\n",
267
  "# print(\"spark_parquet_df count:\", spark_parquet_df.count())\n",
268
- "# spark_parquet_df.show(5)\n"
269
  ]
270
  },
271
  {
@@ -274,14 +283,21 @@
274
  "metadata": {},
275
  "outputs": [],
276
  "source": [
277
- "# 8. Bonus 1: Create a Temp View and Query\n",
278
  "# ========================================\n",
279
  "# 8.1 TODO: Create a temp view with 'spark_merged_clean' (e.g. \"titanic_merged\")\n",
280
  "# spark_merged_clean.createOrReplaceTempView(\"titanic_merged\")\n",
281
  "\n",
282
- "# 8.2 TODO: Spark SQL query example\n",
283
  "# result_df = spark.sql(\"SELECT ... FROM titanic_merged GROUP BY ...\")\n",
284
- "# result_df.show()\n"
285
  ]
286
  },
287
  {
 
87
  "# spark_df = ?\n",
88
  "\n",
89
  "# Check schema and row count\n",
90
+ "# spark_df. ...\n",
91
+ "# print(\"spark_df count:\", spark_df. ...)\n"
92
  ]
93
  },
94
  {
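For reference, a minimal sketch of how this cell could be completed, assuming the `spark` session and the Titanic CSV come from the notebook's earlier setup cells (the file path below is a placeholder, not taken from the notebook):

```python
# Sketch only: "titanic.csv" is a placeholder path; header/inferSchema give
# named columns with inferred types.
spark_df = spark.read.csv("titanic.csv", header=True, inferSchema=True)

spark_df.printSchema()                      # column names and inferred types
print("spark_df count:", spark_df.count())  # number of rows
```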
 
99
  "source": [
100
  "# 3. Split Data into Two Subsets for Merging/Joining\n",
101
  "# ==================================================\n",
102
+ "# Split the dataset into two df's by column, then merge them \n",
103
+ "# back together\n",
104
  "# df_part1: subset of columns -> PassengerId, Name, Sex, Age\n",
105
  "# df_part2: subset of columns -> PassengerId, Fare, Survived, Pclass\n",
106
  "#\n",
107
+ "# \n",
108
  "\n",
109
  "# 3.1 Pandas Split\n",
110
  "# ----------------\n",
111
  "\n",
112
  "# TODO: Create two new DataFrames from pd_df:\n",
113
  "# pd_part1 = pd_df[[\"PassengerId\", \"Name\", \"Sex\", \"Age\"]]\n",
114
+ "# pd_part2 = pd_df[...]\n",
115
  "\n",
116
  "# pd_part1 = ?\n",
117
  "# pd_part2 = ?\n",
 
129
  "# 3.2 Spark Split\n",
130
  "# ---------------\n",
131
  "# TODO: Create two new DataFrames from spark_df:\n",
132
+ "# spark_part1 = spark_df. ...\n",
133
+ "# spark_part2 = spark_df. ...\n",
134
  "\n",
135
  "# spark_part1 = ?\n",
136
  "# spark_part2 = ?\n",
 
153
  "# TODO: Merge pd_part1 and pd_part2 on \"PassengerId\"\n",
154
  "# We'll call the merged DataFrame \"pd_merged\".\n",
155
  "#\n",
156
  "\n",
157
  "# pd_merged = ?\n",
158
  "# print(\"pd_merged shape:\", pd_merged.shape)\n",
 
170
  "# TODO: Join spark_part1 with spark_part2 on \"PassengerId\"\n",
171
  "# We'll call the joined DataFrame \"spark_merged\".\n",
172
  "#\n",
173
  "\n",
174
+ "\n",
175
+ "#Uncomment below\n",
176
  "# spark_merged = ?\n",
177
  "# print(\"spark_merged count:\", spark_merged.count())\n",
178
  "# spark_merged.show(5)\n",
 
242
  "outputs": [],
243
  "source": [
244
  "# 6.2 TODO: Spark - Survival rate by Sex and Pclass\n",
245
+ "# Average survival rate by Sex and Pclass\n",
246
  "#\n",
247
  "# spark_survival_rate = ?\n",
248
  "# spark_survival_rate.show()\n"
 
261
  "# 7.1 TODO: Write spark_merged_clean to Parquet\n",
262
  "# e.g., spark_merged_clean.write. ...\n",
263
  "\n",
264
+ "\n"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": null,
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": [
273
  "# 7.2 TODO: Read it back into a new Spark DataFrame called 'spark_parquet_df'\n",
274
  "# spark_parquet_df = ?\n",
275
  "\n",
276
  "# print(\"spark_parquet_df count:\", spark_parquet_df.count())\n",
277
+ "# spark_parquet_df.show(5)"
278
  ]
279
  },
280
  {
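A possible completion of the Parquet round trip; the output path is an assumption, not taken from the notebook:

```python
# Write the cleaned DataFrame to Parquet (path is a placeholder)
spark_merged_clean.write.mode("overwrite").parquet("titanic_merged.parquet")

# Read it back into a new DataFrame
spark_parquet_df = spark.read.parquet("titanic_merged.parquet")
print("spark_parquet_df count:", spark_parquet_df.count())
spark_parquet_df.show(5)
```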
 
283
  "metadata": {},
284
  "outputs": [],
285
  "source": [
286
+ "# 8. Create a Temp View and Query\n",
287
  "# ========================================\n",
288
  "# 8.1 TODO: Create a temp view with 'spark_merged_clean' (e.g. \"titanic_merged\")\n",
289
  "# spark_merged_clean.createOrReplaceTempView(\"titanic_merged\")\n",
290
  "\n",
291
+ "# 8.2 TODO: Spark SQL query examples\n",
292
+ "\n",
293
+ "#Get the average passenger age grouped by PClass\n",
294
  "# result_df = spark.sql(\"SELECT ... FROM titanic_merged GROUP BY ...\")\n",
295
+ "# result_df.show()\n",
296
+ "\n",
297
+ "# Calculate the Pearson correlation between passenger Fare and Survival\n",
298
+ "# using either SQL or another method\n",
299
+ "# Corr.(X, Y) = cov(X,Y)/(std(X)*std(Y))\n",
300
+ "# corr = ..."
301
  ]
302
  },
303
  {
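A sketch covering both SQL TODOs above, reusing the view name suggested in the cell; the exact SELECT columns are one reasonable choice rather than the notebook's prescribed answer:

```python
# Register the cleaned, merged DataFrame as a SQL view
spark_merged_clean.createOrReplaceTempView("titanic_merged")

# Average passenger age per class
result_df = spark.sql("""
    SELECT Pclass, AVG(Age) AS avg_age
    FROM titanic_merged
    GROUP BY Pclass
    ORDER BY Pclass
""")
result_df.show()

# Pearson correlation: corr(X, Y) = cov(X, Y) / (std(X) * std(Y));
# DataFrame.stat.corr uses the Pearson method by default
corr = spark_merged_clean.stat.corr("Fare", "Survived")
print("corr(Fare, Survived):", corr)
```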
notebooks/solutions.ipynb CHANGED
@@ -256,7 +256,22 @@
256
  " GROUP BY Pclass\n",
257
  " ORDER BY Pclass\n",
258
  " \"\"\")\n",
259
- "result_df.show()\n"
260
  ]
261
  },
262
  {
 
256
  " GROUP BY Pclass\n",
257
  " ORDER BY Pclass\n",
258
  " \"\"\")\n",
259
+ "result_df.show()\n",
260
+ "\n",
261
+ "#Correlation between Fare and Survival\n",
262
+ "# Compute the Pearson correlation between Fare and Survived\n",
263
+ "\n",
264
+ "correlation1 = spark_merged_clean.stat.corr(\"Fare\", \"Survived\", \"pearson\")\n",
265
+ "\n",
266
+ "print(\"Pearson correlation between Fare and Survived:\", correlation1)\n",
267
+ "\n",
268
+ "correlation2 = spark.sql(\n",
269
+ " '''\n",
270
+ " SELECT\n",
271
+ " covar_samp(Fare, Survived) / (stddev_samp(Fare)*stddev_samp(Survived)) as correlation\n",
272
+ " FROM titanic_merged_clean\n",
273
+ " '''\n",
274
+ ")\n"
275
  ]
276
  },
277
  {
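Note that `spark.sql` returns a lazily evaluated DataFrame, so the SQL-based `correlation2` above is only computed once an action is called, for example:

```python
# Materialise the one-row result of the SQL correlation query
correlation2.show()
```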