Spaces:
Running
Running
raymondEDS
committed on
Commit
·
dd039c2
1
Parent(s):
52cb672
Updating week_1 and week_3
Browse files- Reference files/Data_cleaning_lab.ipynb +0 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/components/__pycache__/login.cpython-311.pyc +0 -0
- app/components/login.py +2 -2
- app/main.py +4 -1
- app/pages/__pycache__/week_1.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_2.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_3.cpython-311.pyc +0 -0
- app/pages/week_1.py +112 -4
- app/pages/week_3.py +337 -0
- assets/Pictures/research_question.html +0 -0
- assets/Pictures/research_question.jpg +0 -0
Reference files/Data_cleaning_lab.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app/__pycache__/main.cpython-311.pyc
CHANGED
Binary files a/app/__pycache__/main.cpython-311.pyc and b/app/__pycache__/main.cpython-311.pyc differ
|
|
app/components/__pycache__/login.cpython-311.pyc
CHANGED
Binary files a/app/components/__pycache__/login.cpython-311.pyc and b/app/components/__pycache__/login.cpython-311.pyc differ
|
|
app/components/login.py
CHANGED
@@ -7,8 +7,8 @@ def login():
|
|
7 |
st.title("Login to Data Science Course App")
|
8 |
|
9 |
#usernames
|
10 |
-
usernames = ["admin", "student", "manxiii"]
|
11 |
-
passwords = ["admin", "123", "manxi123"]
|
12 |
|
13 |
# Create a form for login
|
14 |
with st.form("login_form"):
|
|
|
7 |
st.title("Login to Data Science Course App")
|
8 |
|
9 |
#usernames
|
10 |
+
usernames = ["admin", "student", "manxiii","zhu"]
|
11 |
+
passwords = ["admin", "123", "manxi123","zhu123"]
|
12 |
|
13 |
# Create a form for login
|
14 |
with st.form("login_form"):
|
app/main.py
CHANGED
@@ -15,6 +15,7 @@ from app.components.login import login
|
|
15 |
# Import week pages
|
16 |
from app.pages import week_1
|
17 |
from app.pages import week_2
|
|
|
18 |
|
19 |
# Page configuration
|
20 |
st.set_page_config(
|
@@ -136,6 +137,8 @@ def show_week_content():
|
|
136 |
week_1.show()
|
137 |
elif st.session_state.current_week == 2:
|
138 |
week_2.show()
|
|
|
|
|
139 |
else:
|
140 |
st.warning("Content for this week is not yet available.")
|
141 |
|
@@ -148,7 +151,7 @@ def main():
|
|
148 |
return
|
149 |
|
150 |
# User is logged in, show course content
|
151 |
-
if st.session_state.current_week in [1, 2]:
|
152 |
show_week_content()
|
153 |
else:
|
154 |
st.title("Data Science Research Paper Course")
|
|
|
15 |
# Import week pages
|
16 |
from app.pages import week_1
|
17 |
from app.pages import week_2
|
18 |
+
from app.pages import week_3
|
19 |
|
20 |
# Page configuration
|
21 |
st.set_page_config(
|
|
|
137 |
week_1.show()
|
138 |
elif st.session_state.current_week == 2:
|
139 |
week_2.show()
|
140 |
+
elif st.session_state.current_week == 3:
|
141 |
+
week_3.show()
|
142 |
else:
|
143 |
st.warning("Content for this week is not yet available.")
|
144 |
|
|
|
151 |
return
|
152 |
|
153 |
# User is logged in, show course content
|
154 |
+
if st.session_state.current_week in [1, 2, 3]:
|
155 |
show_week_content()
|
156 |
else:
|
157 |
st.title("Data Science Research Paper Course")
|
app/pages/__pycache__/week_1.cpython-311.pyc
CHANGED
Binary files a/app/pages/__pycache__/week_1.cpython-311.pyc and b/app/pages/__pycache__/week_1.cpython-311.pyc differ
|
|
app/pages/__pycache__/week_2.cpython-311.pyc
CHANGED
Binary files a/app/pages/__pycache__/week_2.cpython-311.pyc and b/app/pages/__pycache__/week_2.cpython-311.pyc differ
|
|
app/pages/__pycache__/week_3.cpython-311.pyc
ADDED
Binary file (15.6 kB). View file
|
|
app/pages/week_1.py
CHANGED
@@ -5,14 +5,122 @@ from sklearn.linear_model import LinearRegression
|
|
5 |
|
6 |
# Week 1 content in person
|
7 |
def show():
|
|
|
|
|
|
|
|
|
8 |
st.markdown("""
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
st.markdown("""
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
16 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
if __name__ == "__main__":
|
18 |
show()
|
|
|
5 |
|
6 |
# Week 1 content in person
|
7 |
def show():
|
8 |
+
st.title("Week 1: Developing Research Interests")
|
9 |
+
|
10 |
+
# Section 1: How to do research
|
11 |
+
st.header("1. Research Fundamentals")
|
12 |
st.markdown("""
|
13 |
+
|
14 |
+
### What is research?
|
15 |
+
Research is a systematic process of investigation to discover new knowledge or validate existing knowledge.
|
16 |
+
|
17 |
+
#### How to get started learning about your topic:
|
18 |
+
1. Websearch the topic area in general
|
19 |
+
1. Wikipedia
|
20 |
+
2. Google
|
21 |
+
3. Stanford Encyclopedia of Philosophy
|
22 |
+
4. News Reports
|
23 |
+
2. Look for specific topics in Google Scholar or other scholarly databases
|
24 |
+
1. [CNKI](https://www.cnki.net/index/)
|
25 |
+
3. Read research papers
|
26 |
+
|
27 |
+
- **Google Scholar**: Your gateway to academic literature
|
28 |
+
- Add key terms from your general search
|
29 |
+
- Look through academic papers
|
30 |
+
- Find data sources
|
31 |
+
- Utilize literature reviews
|
32 |
+
|
33 |
""")
|
34 |
+
st.header("1A. How to Read a Research paper")
|
35 |
+
st.markdown("""
|
36 |
+
1. Introduction and conclusion/results are the most important sections
|
37 |
+
- These provide key context and findings
|
38 |
+
- Focus on these first for quick understanding
|
39 |
+
|
40 |
+
2. Approach and methodology sections are also critical
|
41 |
+
- Help understand how the research was conducted
|
42 |
+
- Important for evaluating validity
|
43 |
+
|
44 |
+
3. Citations and footnotes provide valuable context
|
45 |
+
- Help trace development of ideas
|
46 |
+
- Point to related work and background material
|
47 |
+
- Essential for understanding the broader research area
|
48 |
|
49 |
+
""")
|
50 |
+
col1, col2 = st.columns(2)
|
51 |
+
|
52 |
+
with col1:
|
53 |
+
st.header("1B. Literature Reviews")
|
54 |
+
st.markdown("""
|
55 |
+
- **Literature Review**:
|
56 |
+
- Full Literature Review
|
57 |
+
- General problem/task definition: What are these papers trying to solve, and why?
|
58 |
+
- Concise summaries of the articles: Do not simply copy the article text in full. We can read them ourselves. Put in your own words the major contributions of each article.
|
59 |
+
- Compare and contrast: Point out the similarities and differences of the papers. Do they agree with each other? Are results seemingly in conflict? If the papers address different subtasks, how are they related? (If they are not related, then you may have made poor choices for a lit review...). This section is probably the most valuable for the final project, as it can become the basis for a lit review section.
|
60 |
+
- Future work: Make several suggestions for how the work can be extended. Are there open questions to answer? This would presumably include how the papers relate to your final project idea.
|
61 |
+
- References section: The entries should appear alphabetically and give at least full author name(s), year of publication, title, and outlet if applicable (e.g., journal name or proceedings name). Beyond that, we are not picky about the format. Electronic references are fine but need to include the above information in addition to the link.[^1]
|
62 |
+
|
63 |
+
[^1]: Credit: Stanford CS224U
|
64 |
+
""")
|
65 |
+
|
66 |
+
with col2:
|
67 |
+
st.header("1C. Research Memos")
|
68 |
+
st.markdown("""
|
69 |
+
### Article Summary Memo
|
70 |
+
|
71 |
+
- The central research question (what are they studying?)
|
72 |
+
- The context of the study (where is the study taking place?)
|
73 |
+
- What type of data is being studied?
|
74 |
+
- What do the authors find?
|
75 |
+
|
76 |
+
*Credit: University of Chicago – DPSS program*
|
77 |
+
""")
|
78 |
+
# Section 2: Research Question Formulation
|
79 |
+
st.header("2. Formulating Research Questions")
|
80 |
+
st.markdown("""
|
81 |
+
A good research question is the foundation of any research project. It should be:
|
82 |
+
- Clear and focused
|
83 |
+
- Researchable
|
84 |
+
- Feasible
|
85 |
+
- Significant
|
86 |
+
- Ethical
|
87 |
+
""")
|
88 |
+
|
89 |
+
# Display the research question image
|
90 |
+
st.image("assets/Pictures/research_question.jpg", caption="Research Question Formulation Framework")
|
91 |
+
|
92 |
st.markdown("""
|
93 |
+
### Steps to Formulate Your Research Question:
|
94 |
+
1. Start with a broad topic
|
95 |
+
2. Do preliminary research
|
96 |
+
3. Narrow down to specific aspects
|
97 |
+
4. Formulate your question
|
98 |
+
5. Refine and test your question
|
99 |
""")
|
100 |
+
|
101 |
+
# Section 3: Homework
|
102 |
+
st.header("3. Homework Assignment")
|
103 |
+
st.markdown("""
|
104 |
+
### Tasks for this week:
|
105 |
+
1. **Article Analysis**
|
106 |
+
- Provide a summary of the articles by answering the following questions:
|
107 |
+
- The central research question (what are they studying?)
|
108 |
+
- The context of the study (where is the study taking place?)
|
109 |
+
- What type of data is being studied?
|
110 |
+
- What do the authors find?
|
111 |
+
|
112 |
+
2. **Research Questions**
|
113 |
+
- Think about what research questions you would like to answer:
|
114 |
+
- What are the similarities between the studies?
|
115 |
+
- What are the differences between the studies?
|
116 |
+
- Come up with 5 potential research questions for your own research
|
117 |
+
|
118 |
+
3. **Reference Papers**
|
119 |
+
- [Peer Review](https://cogcomp.github.io/iclr_database/)
|
120 |
+
- [OpenDebateEvidence](https://arxiv.org/pdf/2406.14657)
|
121 |
+
|
122 |
+
|
123 |
+
""")
|
124 |
+
|
125 |
if __name__ == "__main__":
|
126 |
show()
|
app/pages/week_3.py
ADDED
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
import io
|
7 |
+
import sys
|
8 |
+
from contextlib import redirect_stdout
|
9 |
+
|
10 |
+
# Initialize session state for notebook-like cells
|
11 |
+
if 'cells' not in st.session_state:
|
12 |
+
st.session_state.cells = []
|
13 |
+
if 'df' not in st.session_state:
|
14 |
+
st.session_state.df = None
|
15 |
+
|
16 |
+
def capture_output(code, df=None):
    """Run a user-supplied code snippet and return everything it printed.

    The snippet is executed with ``exec`` in a new namespace created for
    this call. The namespace exposes the module aliases ``pd``, ``np``,
    ``plt``, ``sns`` and ``st`` and, when ``df`` is not None, the given
    object under the name ``df``.

    Args:
        code: Python source text to execute.
        df: Optional object made available to the snippet as ``df``.

    Returns:
        The text the snippet wrote to stdout. If the snippet raises an
        ``Exception``, the text printed before the failure followed by
        ``"Error: <message>"``.
    """
    # SECURITY: exec() runs whatever text it is given with the full
    # privileges of the app process. NOTE(review): the callers pass text
    # typed into st.text_area; confirm only trusted users can reach this page
    # or move execution into a sandbox.
    #
    # `st` is part of the namespace because the default snippets in this file
    # finish with `st.pyplot(fig)`; without it they raise NameError.
    # NOTE: a caller that also renders `plt.gcf()` after this call will draw
    # the snippet's last figure a second time.
    namespace = {'pd': pd, 'np': np, 'plt': plt, 'sns': sns, 'st': st}
    if df is not None:
        namespace['df'] = df

    buffer = io.StringIO()
    with redirect_stdout(buffer):
        try:
            exec(code, namespace)
        except Exception as e:
            # Keep whatever was printed before the failure so the user sees
            # how far the snippet got, then append the error message.
            return f"{buffer.getvalue()}Error: {e}"
    return buffer.getvalue()
|
29 |
+
|
30 |
+
def show():
|
31 |
+
st.title("Week 3: Data Cleaning and Exploratory Data Analysis")
|
32 |
+
|
33 |
+
# Section 1: Introduction to EDA
|
34 |
+
st.header("1. Introduction to Exploratory Data Analysis")
|
35 |
+
st.markdown("""
|
36 |
+
Exploratory Data Analysis (EDA) is a crucial step in any data science project. Whether EDA is the main purpose of your project or is being used for feature selection/feature engineering in a machine learning context, it's important to understand the relationships between your features and target variables.
|
37 |
+
|
38 |
+
In this module, we'll focus on:
|
39 |
+
- Understanding categorical variables
|
40 |
+
- Data cleaning techniques
|
41 |
+
- Visualizing relationships in data
|
42 |
+
- Identifying patterns and insights
|
43 |
+
""")
|
44 |
+
|
45 |
+
# Section 2: The Titanic Dataset
|
46 |
+
st.header("2. Working with the Titanic Dataset")
|
47 |
+
st.markdown("""
|
48 |
+
We'll use the famous Titanic dataset to demonstrate data cleaning and EDA techniques. This dataset contains information about passengers aboard the Titanic and whether they survived.
|
49 |
+
|
50 |
+
### Dataset Description
|
51 |
+
| Variable | Definition | Key |
|
52 |
+
| -------- | ---------- | --- |
|
53 |
+
| survival | Survival | 0 = No, 1 = Yes |
|
54 |
+
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
|
55 |
+
| sex | Sex | |
|
56 |
+
| Age | Age in years | |
|
57 |
+
| sibsp | # of siblings / spouses aboard | |
|
58 |
+
| parch | # of parents / children aboard | |
|
59 |
+
| ticket | Ticket number | |
|
60 |
+
| fare | Passenger fare | |
|
61 |
+
| cabin | Cabin number | |
|
62 |
+
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |
|
63 |
+
""")
|
64 |
+
|
65 |
+
# Load and display the dataset
|
66 |
+
@st.cache_data
|
67 |
+
def load_data():
|
68 |
+
return pd.read_csv("https://raw.githubusercontent.com/hoffm386/eda-with-categorical-variables/master/titanic.csv")
|
69 |
+
|
70 |
+
df = load_data()
|
71 |
+
st.session_state.df = df
|
72 |
+
|
73 |
+
st.subheader("Dataset Preview")
|
74 |
+
st.dataframe(df.head())
|
75 |
+
|
76 |
+
# Interactive Data Loading Example
|
77 |
+
st.subheader("Try loading the data yourself!")
|
78 |
+
load_code = st.text_area("Try loading the Titanic dataset:",
|
79 |
+
'import pandas as pd\n\ndf = pd.read_csv("https://raw.githubusercontent.com/hoffm386/eda-with-categorical-variables/master/titanic.csv")\nprint(df.head())',
|
80 |
+
height=100)
|
81 |
+
st.code(load_code, language="python", line_numbers=True)
|
82 |
+
if st.button("Run Data Loading Code"):
|
83 |
+
output = capture_output(load_code, df)
|
84 |
+
st.code(output, language="python", line_numbers=True)
|
85 |
+
|
86 |
+
# Basic Dataset Information
|
87 |
+
st.subheader("Dataset Information")
|
88 |
+
st.markdown("""
|
89 |
+
Let's explore some basic information about our dataset. Try these commands:
|
90 |
+
""")
|
91 |
+
|
92 |
+
info_code = st.text_area("Try getting dataset information:",
|
93 |
+
'print("Dataset Shape:", df.shape)\nprint("\\nColumn Names:", df.columns.tolist())\nprint("\\nData Types:\\n", df.dtypes)\nprint("\\nMissing Values:\\n", df.isnull().sum())',
|
94 |
+
height=150)
|
95 |
+
st.code(info_code, language="python", line_numbers=True)
|
96 |
+
if st.button("Run Info Code"):
|
97 |
+
output = capture_output(info_code, df)
|
98 |
+
st.code(output, language="python", line_numbers=True)
|
99 |
+
|
100 |
+
# Section 3: Data Cleaning
|
101 |
+
st.header("3. Data Cleaning Techniques")
|
102 |
+
|
103 |
+
# Missing Value Handling
|
104 |
+
st.subheader("Missing Value Analysis")
|
105 |
+
st.markdown("""
|
106 |
+
Let's analyze and handle missing values in our dataset. Try these examples:
|
107 |
+
""")
|
108 |
+
|
109 |
+
missing_code = st.text_area("Try analyzing missing values:",
|
110 |
+
'missing_percent = (df.isnull().sum() / len(df)) * 100\nprint("Percentage of missing values:\\n", missing_percent[missing_percent > 0])\n\n# Try filling missing values\ndf_filled = df.copy()\ndf_filled["Age"].fillna(df_filled["Age"].median(), inplace=True)\nprint("\\nMissing values after filling Age:", df_filled["Age"].isnull().sum())',
|
111 |
+
height=150)
|
112 |
+
st.code(missing_code, language="python", line_numbers=True)
|
113 |
+
if st.button("Run Missing Value Code"):
|
114 |
+
output = capture_output(missing_code, df)
|
115 |
+
st.code(output, language="python", line_numbers=True)
|
116 |
+
|
117 |
+
# Data Type Conversion
|
118 |
+
st.subheader("Data Type Conversion")
|
119 |
+
st.markdown("""
|
120 |
+
Let's convert categorical variables to the appropriate data types:
|
121 |
+
""")
|
122 |
+
|
123 |
+
type_code = st.text_area("Try converting data types:",
|
124 |
+
'df_cat = df.copy()\ndf_cat["Sex"] = df_cat["Sex"].astype("category")\ndf_cat["Embarked"] = df_cat["Embarked"].astype("category")\nprint("Data types after conversion:\\n", df_cat.dtypes)',
|
125 |
+
height=100)
|
126 |
+
st.code(type_code, language="python", line_numbers=True)
|
127 |
+
if st.button("Run Type Conversion Code"):
|
128 |
+
output = capture_output(type_code, df)
|
129 |
+
st.code(output, language="python", line_numbers=True)
|
130 |
+
|
131 |
+
# Section 4: EDA with Categorical Variables
|
132 |
+
st.header("4. EDA with Categorical Variables")
|
133 |
+
|
134 |
+
# Interactive Visualizations
|
135 |
+
st.subheader("Create Your Own Visualizations")
|
136 |
+
st.markdown("""
|
137 |
+
Let's explore different types of visualizations to understand our data better:
|
138 |
+
|
139 |
+
1. **Basic Count Plots**
|
140 |
+
First, let's look at the distribution of passengers by class and survival:
|
141 |
+
""")
|
142 |
+
|
143 |
+
viz_code = st.text_area("Try creating basic visualizations:",
|
144 |
+
'''import matplotlib.pyplot as plt
|
145 |
+
import seaborn as sns
|
146 |
+
|
147 |
+
# Create a figure with two subplots
|
148 |
+
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
|
149 |
+
|
150 |
+
# Count plot for Sex
|
151 |
+
sns.countplot(data=df, x="Sex", ax=ax1)
|
152 |
+
ax1.set_title("Passenger Count by Sex")
|
153 |
+
|
154 |
+
# Bar plot for survival rate by Pclass
|
155 |
+
sns.barplot(data=df, x="Pclass", y="Survived", ax=ax2)
|
156 |
+
ax2.set_title("Survival Rate by Passenger Class")
|
157 |
+
|
158 |
+
plt.tight_layout()
|
159 |
+
st.pyplot(fig)''',
|
160 |
+
height=200)
|
161 |
+
st.code(viz_code, language="python", line_numbers=True)
|
162 |
+
if st.button("Run Basic Visualization Code"):
|
163 |
+
output = capture_output(viz_code, df)
|
164 |
+
st.pyplot(plt.gcf())
|
165 |
+
|
166 |
+
# Advanced Visualizations
|
167 |
+
st.subheader("Advanced Visualizations")
|
168 |
+
st.markdown("""
|
169 |
+
Now let's create more complex visualizations to understand relationships between variables:
|
170 |
+
|
171 |
+
2. **Survival Analysis by Class**
|
172 |
+
Let's analyze survival rates across different passenger classes with a stacked bar chart:
|
173 |
+
""")
|
174 |
+
|
175 |
+
advanced_viz_code = st.text_area("Try creating advanced visualizations:",
|
176 |
+
'''import matplotlib.pyplot as plt
|
177 |
+
import seaborn as sns
|
178 |
+
from matplotlib.patches import Patch
|
179 |
+
|
180 |
+
# Create figure and axis
|
181 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
182 |
+
|
183 |
+
# Create countplot with custom colors
|
184 |
+
sns.countplot(x="Pclass", hue="Survived", data=df,
|
185 |
+
palette={1: "blue", 0: "red"}, ax=ax)
|
186 |
+
|
187 |
+
# Customize the plot
|
188 |
+
ax.set_xlabel("Passenger Class")
|
189 |
+
ax.set_title("Survival Distribution by Passenger Class")
|
190 |
+
|
191 |
+
# Create custom legend
|
192 |
+
legend_elements = [
|
193 |
+
Patch(facecolor="blue", label="Survived"),
|
194 |
+
Patch(facecolor="red", label="Did Not Survive")
|
195 |
+
]
|
196 |
+
ax.legend(handles=legend_elements)
|
197 |
+
|
198 |
+
plt.tight_layout()
|
199 |
+
st.pyplot(fig)
|
200 |
+
|
201 |
+
# Create a second figure for percentage analysis
|
202 |
+
fig2, ax2 = plt.subplots(figsize=(10, 6))
|
203 |
+
|
204 |
+
# Calculate percentages
|
205 |
+
survival_by_class = df.groupby("Pclass")["Survived"].value_counts(normalize=True).unstack()
|
206 |
+
survival_by_class.plot(kind="bar", stacked=True, ax=ax2)
|
207 |
+
|
208 |
+
# Customize the plot
|
209 |
+
ax2.set_xlabel("Passenger Class")
|
210 |
+
ax2.set_ylabel("Percentage")
|
211 |
+
ax2.set_title("Survival Rate by Passenger Class")
|
212 |
+
ax2.legend(title="Survived", labels=["No", "Yes"])
|
213 |
+
|
214 |
+
plt.tight_layout()
|
215 |
+
st.pyplot(fig2)''',
|
216 |
+
height=400)
|
217 |
+
st.code(advanced_viz_code, language="python", line_numbers=True)
|
218 |
+
if st.button("Run Advanced Visualization Code"):
|
219 |
+
output = capture_output(advanced_viz_code, df)
|
220 |
+
st.pyplot(plt.gcf())
|
221 |
+
|
222 |
+
# Age Distribution Analysis
|
223 |
+
st.subheader("Age Distribution Analysis")
|
224 |
+
st.markdown("""
|
225 |
+
3. **Age Distribution by Survival**
|
226 |
+
Let's examine how age relates to survival:
|
227 |
+
""")
|
228 |
+
|
229 |
+
age_viz_code = st.text_area("Try creating age distribution visualizations:",
|
230 |
+
'''import matplotlib.pyplot as plt
|
231 |
+
|
232 |
+
# Create figure and axis
|
233 |
+
fig, ax = plt.subplots()
|
234 |
+
|
235 |
+
# Plot histograms for survived and non-survived passengers
|
236 |
+
ax.hist(df[df["Survived"]==1]["Age"], bins=15, alpha=0.5, color="blue", label="survived")
|
237 |
+
ax.hist(df[df["Survived"]==0]["Age"], bins=15, alpha=0.5, color="green", label="did not survive")
|
238 |
+
|
239 |
+
# Customize the plot
|
240 |
+
ax.set_xlabel("Age")
|
241 |
+
ax.set_ylabel("Count of passengers")
|
242 |
+
ax.set_title("Age vs. Survival for Titanic Passengers")
|
243 |
+
ax.legend()
|
244 |
+
|
245 |
+
plt.tight_layout()
|
246 |
+
st.pyplot(fig)''',
|
247 |
+
height=200)
|
248 |
+
st.code(age_viz_code, language="python", line_numbers=True)
|
249 |
+
if st.button("Run Age Distribution Code"):
|
250 |
+
output = capture_output(age_viz_code, df)
|
251 |
+
st.pyplot(plt.gcf())
|
252 |
+
|
253 |
+
# Age and Fare Analysis
|
254 |
+
st.subheader("Age and Fare Analysis")
|
255 |
+
st.markdown("""
|
256 |
+
4. **Survival by Age and Fare**
|
257 |
+
Let's analyze how both age and fare relate to survival:
|
258 |
+
""")
|
259 |
+
|
260 |
+
age_fare_viz_code = st.text_area("Try creating age and fare visualizations:",
|
261 |
+
'''import matplotlib.pyplot as plt
|
262 |
+
from matplotlib.lines import Line2D
|
263 |
+
|
264 |
+
# Create figure and axis
|
265 |
+
fig, ax = plt.subplots(figsize=(10, 5))
|
266 |
+
|
267 |
+
# Plot scatter points for survived and non-survived passengers
|
268 |
+
ax.scatter(df[df["Survived"]==1]["Age"], df[df["Survived"]==1]["Fare"],
|
269 |
+
c="blue", alpha=0.5, label="survived")
|
270 |
+
ax.scatter(df[df["Survived"]==0]["Age"], df[df["Survived"]==0]["Fare"],
|
271 |
+
c="green", alpha=0.5, label="did not survive")
|
272 |
+
|
273 |
+
# Customize the plot
|
274 |
+
ax.set_xlabel("Age")
|
275 |
+
ax.set_ylabel("Fare")
|
276 |
+
ax.set_title("Survival by Age and Fare for Titanic Passengers")
|
277 |
+
|
278 |
+
# Create custom legend
|
279 |
+
color_patches = [
|
280 |
+
Line2D([0], [0], marker='o', color='w', label='survived',
|
281 |
+
markerfacecolor='b', markersize=10),
|
282 |
+
Line2D([0], [0], marker='o', color='w', label='did not survive',
|
283 |
+
markerfacecolor='g', markersize=10)
|
284 |
+
]
|
285 |
+
ax.legend(handles=color_patches)
|
286 |
+
|
287 |
+
plt.tight_layout()
|
288 |
+
st.pyplot(fig)''',
|
289 |
+
height=250)
|
290 |
+
st.code(age_fare_viz_code, language="python", line_numbers=True)
|
291 |
+
if st.button("Run Age and Fare Visualization Code"):
|
292 |
+
output = capture_output(age_fare_viz_code, df)
|
293 |
+
st.pyplot(plt.gcf())
|
294 |
+
|
295 |
+
# Section 5: Hands-on Exercise
|
296 |
+
st.header("5. Hands-on Exercise")
|
297 |
+
st.markdown("""
|
298 |
+
### Tasks for this week:
|
299 |
+
|
300 |
+
1. **Data Cleaning Exercise**
|
301 |
+
- Load the Titanic dataset
|
302 |
+
- Identify and handle missing values
|
303 |
+
- Convert categorical variables
|
304 |
+
- Create summary statistics
|
305 |
+
|
306 |
+
2. **EDA Analysis**
|
307 |
+
- Create visualizations for key variables
|
308 |
+
- Analyze relationships between variables
|
309 |
+
- Identify patterns in survival rates
|
310 |
+
|
311 |
+
3. **Report Writing**
|
312 |
+
- Document your findings
|
313 |
+
- Create a presentation of key insights
|
314 |
+
- Suggest potential next steps
|
315 |
+
""")
|
316 |
+
|
317 |
+
# Interactive Exercise
|
318 |
+
st.subheader("Try Your Own Analysis")
|
319 |
+
exercise_code = st.text_area("Write your own analysis code here:",
|
320 |
+
'# Your code here\n# Try analyzing the relationship between Age and Survival\n# Or create your own visualizations\n# Or perform any other analysis you find interesting',
|
321 |
+
height=150)
|
322 |
+
st.code(exercise_code, language="python", line_numbers=True)
|
323 |
+
if st.button("Run Exercise Code"):
|
324 |
+
output = capture_output(exercise_code, df)
|
325 |
+
st.code(output, language="python", line_numbers=True)
|
326 |
+
|
327 |
+
# Section 6: Resources
|
328 |
+
st.header("6. Additional Resources")
|
329 |
+
st.markdown("""
|
330 |
+
- [EDA with Categorical Variables](https://github.com/hoffm386/eda-with-categorical-variables)
|
331 |
+
- [Kaggle EDA Tutorial](https://www.kaggle.com/code/kashnitsky/topic-1-exploratory-data-analysis-with-pandas)
|
332 |
+
- [Pandas Documentation](https://pandas.pydata.org/docs/)
|
333 |
+
- [Seaborn Documentation](https://seaborn.pydata.org/)
|
334 |
+
""")
|
335 |
+
|
336 |
+
if __name__ == "__main__":
|
337 |
+
show()
|
assets/Pictures/research_question.html
ADDED
The diff for this file is too large to render.
See raw diff
|
|
assets/Pictures/research_question.jpg
ADDED
![]() |