mdik1 committed
Commit da0e299 · verified · 1 Parent(s): 4025460

Update app.py

Files changed (1):
  1. app.py +256 -227
app.py CHANGED
@@ -1,227 +1,256 @@
-import streamlit as st
-import pandas as pd
-import seaborn as sns
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler, LabelEncoder
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import classification_report, accuracy_score
-import nbformat as nbf
-import io
-import sqlite3
-from io import StringIO
-import os
-
-# Constants
-DB_PATH = "db/database.db"
-TEMP_DIR = "temp/"
-
-# Ensure directories exist
-os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
-os.makedirs(TEMP_DIR, exist_ok=True)
-
-# Initialize SQLite database
-def init_db():
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("""
-        CREATE TABLE IF NOT EXISTS datasets (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            name TEXT NOT NULL,
-            content TEXT NOT NULL
-        )
-    """)
-    conn.commit()
-    conn.close()
-
-# Save dataset to SQLite
-def save_dataset_to_db(name, content):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("INSERT INTO datasets (name, content) VALUES (?, ?)", (name, content))
-    conn.commit()
-    conn.close()
-
-# Fetch all datasets from SQLite
-def get_datasets():
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("SELECT id, name FROM datasets")
-    datasets = cursor.fetchall()
-    conn.close()
-    return datasets
-
-# Load dataset by ID
-def load_dataset_from_db(dataset_id):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("SELECT content FROM datasets WHERE id = ?", (dataset_id,))
-    content = cursor.fetchone()
-    conn.close()
-    if content:
-        return StringIO(content[0])
-    return None
-
-# Initialize database
-init_db()
-
-# Function to detect problem type
-def detect_problem_type(df, target_column):
-    if target_column not in df.columns:
-        return "Error: Target column not found in the dataset."
-
-    df_clean = df.dropna(subset=[target_column])
-    unique_values = df_clean[target_column].nunique()
-    if unique_values == 2:
-        return "binary_classification"
-    elif unique_values > 2:
-        return "multiclass_classification"
-    else:
-        return "Error: Invalid target column (not enough unique values)."
-
-# Function to generate notebook content
-def generate_notebook_code(csv_path, target_column, problem_type):
-    notebook = nbf.v4.new_notebook()
-    code = f"""
-import pandas as pd
-import seaborn as sns
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler, LabelEncoder
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import classification_report, accuracy_score
-
-# Load Dataset
-df = pd.read_csv("{csv_path}")
-target_column = "{target_column}"
-
-# Display the first few rows
-print(df.head())
-
-# Check for missing values
-print("Missing Values:\\n", df.isnull().sum())
-
-# Encode categorical columns
-categorical_cols = df.select_dtypes(include=['object']).columns
-for col in categorical_cols:
-    df[col] = LabelEncoder().fit_transform(df[col])
-
-# Fill missing values with median
-df.fillna(df.median(), inplace=True)
-
-# Split data into features and target
-X = df.drop(columns=[target_column])
-y = df[target_column]
-
-# Standardize numeric columns
-scaler = StandardScaler()
-X = scaler.fit_transform(X)
-
-# Train/Test Split
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-# Train Model
-model = RandomForestClassifier() if "{problem_type}" in ["binary_classification", "multiclass_classification"] else None
-model.fit(X_train, y_train)
-
-# Predict and Evaluate
-y_pred = model.predict(X_test)
-print("Accuracy Score:", accuracy_score(y_test, y_pred))
-print("Classification Report:\\n", classification_report(y_test, y_pred))
-"""
-    notebook.cells.append(nbf.v4.new_code_cell(code))
-    return notebook
-
-# Streamlit app
-st.title("Automated Data Science App")
-st.write("Upload a CSV file and specify the target column to automatically process and train models.")
-
-# File upload
-uploaded_file = st.file_uploader("Upload your CSV file", type="csv")
-target_column = st.text_input("Enter the target column name")
-
-if uploaded_file and target_column:
-    try:
-        df = pd.read_csv(uploaded_file)
-        st.write("Dataset Preview:")
-        st.write(df.head())
-
-        st.subheader("Missing Values")
-        st.write(df.isnull().sum())
-
-        st.subheader("Basic Statistics")
-        st.write(df.describe())
-
-        problem_type = detect_problem_type(df, target_column)
-        if "Error" in problem_type:
-            st.error(problem_type)
-        else:
-            st.write(f"Detected Problem Type: {problem_type}")
-
-            # Save dataset to database
-            save_dataset_to_db(uploaded_file.name, uploaded_file.getvalue().decode("utf-8"))
-
-            categorical_cols = df.select_dtypes(include=['object']).columns
-            for col in categorical_cols:
-                df[col] = LabelEncoder().fit_transform(df[col])
-
-            df.fillna(df.median(), inplace=True)
-            X = df.drop(columns=[target_column])
-            y = df[target_column]
-
-            scaler = StandardScaler()
-            X = scaler.fit_transform(X)
-
-            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-            model = RandomForestClassifier()
-            model.fit(X_train, y_train)
-
-            # Evaluate model
-            y_pred = model.predict(X_test)
-            st.subheader("Model Performance")
-            st.write("Accuracy:", accuracy_score(y_test, y_pred))
-
-            # Display the classification report with proper formatting
-            st.write("Classification Report:")
-            report = classification_report(y_test, y_pred)
-            st.code(report)  # st.text ensures the report is displayed with proper formatting
-
-            feature_importances = model.feature_importances_
-            important_features = pd.Series(feature_importances, index=df.drop(columns=[target_column]).columns)
-            important_features = important_features.sort_values(ascending=False).head(5)
-
-            st.subheader("Important Features")
-            st.write(important_features)
-
-            st.subheader("Visualizations")
-            for feature in important_features.index:
-                st.write(f"Box Plot for {feature}")
-                fig, ax = plt.subplots(figsize=(8, 6))
-                sns.boxplot(x=y, y=df[feature], ax=ax)
-                st.pyplot(fig)
-
-                st.write(f"Histogram for {feature}")
-                fig, ax = plt.subplots(figsize=(8, 6))
-                sns.histplot(df[feature], kde=True, bins=30, ax=ax)
-                st.pyplot(fig)
-
-            temp_csv_path = os.path.join(TEMP_DIR, uploaded_file.name)
-            with open(temp_csv_path, "w") as f:
-                f.write(uploaded_file.getvalue().decode("utf-8"))
-
-            notebook = generate_notebook_code(temp_csv_path, target_column, problem_type)
-            notebook_buffer = io.StringIO()
-            nbf.write(notebook, notebook_buffer)
-            notebook_buffer.seek(0)
-            notebook_content = notebook_buffer.getvalue()
-
-            st.download_button(
-                label="Download Code Notebook",
-                data=notebook_content,
-                file_name="data_science_pipeline.ipynb",
-                mime="application/json"
-            )
-
-    except Exception as e:
-        st.error(f"An error occurred: {e}")
+import streamlit as st
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import classification_report, accuracy_score
+import nbformat as nbf
+import io
+import sqlite3
+from io import StringIO
+import os
+
+# Constants
+DB_PATH = "db/database.db"
+TEMP_DIR = "temp/"
+
+# Ensure directories exist
+os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
+os.makedirs(TEMP_DIR, exist_ok=True)
+
+# Initialize SQLite database
+def init_db():
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("""
+        CREATE TABLE IF NOT EXISTS datasets (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name TEXT NOT NULL,
+            content TEXT NOT NULL
+        )
+    """)
+    conn.commit()
+    conn.close()
+
+# Save dataset to SQLite
+def save_dataset_to_db(name, content):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("INSERT INTO datasets (name, content) VALUES (?, ?)", (name, content))
+    conn.commit()
+    conn.close()
+
+# Fetch all datasets from SQLite
+def get_datasets():
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("SELECT id, name FROM datasets")
+    datasets = cursor.fetchall()
+    conn.close()
+    return datasets
+
+# Load dataset by ID
+def load_dataset_from_db(dataset_id):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("SELECT content FROM datasets WHERE id = ?", (dataset_id,))
+    content = cursor.fetchone()
+    conn.close()
+    if content:
+        return StringIO(content[0])
+    return None
+
+# Initialize database
+init_db()
+
+# Function to detect problem type
+def detect_problem_type(df, target_column):
+    if target_column not in df.columns:
+        return "Error: Target column not found in the dataset."
+
+    df_clean = df.dropna(subset=[target_column])
+    unique_values = df_clean[target_column].nunique()
+    if unique_values == 2:
+        return "binary_classification"
+    elif unique_values > 2:
+        return "multiclass_classification"
+    else:
+        return "Error: Invalid target column (not enough unique values)."
+
+# Function to generate notebook content
+def generate_notebook_code(csv_path, target_column, problem_type):
+    notebook = nbf.v4.new_notebook()
+    code = f"""
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import classification_report, accuracy_score
+
+# Load Dataset
+df = pd.read_csv("{csv_path}")
+target_column = "{target_column}"
+
+# Display the first few rows
+print(df.head())
+
+# Check for missing values
+print("Missing Values:\\n", df.isnull().sum())
+
+# Encode categorical columns
+categorical_cols = df.select_dtypes(include=['object']).columns
+for col in categorical_cols:
+    df[col] = LabelEncoder().fit_transform(df[col])
+
+# Fill missing values with median
+df.fillna(df.median(), inplace=True)
+
+# Split data into features and target
+X = df.drop(columns=[target_column])
+y = df[target_column]
+
+# Standardize numeric columns
+scaler = StandardScaler()
+X = scaler.fit_transform(X)
+
+# Train/Test Split
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Train Models
+models = []
+if "{problem_type}" in ["binary_classification", "multiclass_classification"]:
+    models.append(("Random Forest", RandomForestClassifier()))
+    models.append(("Logistic Regression", LogisticRegression()))
+    models.append(("SVM", SVC()))
+    models.append(("Decision Tree", DecisionTreeClassifier()))
+
+# Model Evaluation
+results = []
+for model_name, model in models:
+    model.fit(X_train, y_train)
+    y_pred = model.predict(X_test)
+    accuracy = accuracy_score(y_test, y_pred)
+    results.append((model_name, accuracy))
+
+print("Model Performance:")
+for model_name, accuracy in results:
+    print(f"{{model_name}}: {{accuracy}}")  # NOTE: braces doubled so the outer f-string writes them literally into the notebook
+"""
+    notebook.cells.append(nbf.v4.new_code_cell(code))
+    return notebook
+
+# Streamlit app
+st.title("Automated Data Science App")
+st.write("Upload a CSV file and specify the target column to automatically process and train models.")
+
+# File upload
+uploaded_file = st.file_uploader("Upload your CSV file", type="csv")
+target_column = st.text_input("Enter the target column name")
+
+if uploaded_file and target_column:
+    try:
+        df = pd.read_csv(uploaded_file)
+        st.write("Dataset Preview:")
+        st.write(df.head())
+
+        st.subheader("Missing Values")
+        st.write(df.isnull().sum())
+
+        st.subheader("Basic Statistics")
+        st.write(df.describe())
+
+        problem_type = detect_problem_type(df, target_column)
+        if "Error" in problem_type:
+            st.error(problem_type)
+        else:
+            st.write(f"Detected Problem Type: {problem_type}")
+
+            # Save dataset to database
+            save_dataset_to_db(uploaded_file.name, uploaded_file.getvalue().decode("utf-8"))
+
+            categorical_cols = df.select_dtypes(include=['object']).columns
+            for col in categorical_cols:
+                df[col] = LabelEncoder().fit_transform(df[col])
+
+            df.fillna(df.median(), inplace=True)
+            X = df.drop(columns=[target_column])
+            y = df[target_column]
+
+            scaler = StandardScaler()
+            X = scaler.fit_transform(X)
+
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+            models = [
+                ("Random Forest", RandomForestClassifier()),
+                ("Logistic Regression", LogisticRegression()),
+                ("SVM", SVC()),
+                ("Decision Tree", DecisionTreeClassifier())
+            ]
+
+            results = []
+            for model_name, model in models:
+                model.fit(X_train, y_train)
+                y_pred = model.predict(X_test)
+                accuracy = accuracy_score(y_test, y_pred)
+                results.append((model_name, accuracy))
+
+            # Display results in a table
+            st.subheader("Model Performance")
+            results_df = pd.DataFrame(results, columns=["Model Name", "Accuracy"])
+            st.write(results_df)
+
+            # Display the classification report with proper formatting
+            st.subheader("Classification Report")
+            report = classification_report(y_test, y_pred)
+            st.code(report)  # st.code keeps the report's fixed-width formatting; y_pred here comes from the last model in the loop (Decision Tree)
+
+            feature_importances = model.feature_importances_ if hasattr(model, "feature_importances_") else None
+            if feature_importances is not None:
+                important_features = pd.Series(feature_importances, index=df.drop(columns=[target_column]).columns)
+                important_features = important_features.sort_values(ascending=False).head(5)
+
+                st.subheader("Important Features")
+                st.write(important_features)
+
+                st.subheader("Visualizations")
+                for feature in important_features.index:
+                    st.write(f"Box Plot for {feature}")
+                    fig, ax = plt.subplots(figsize=(8, 6))
+                    sns.boxplot(x=y, y=df[feature], ax=ax)
+                    st.pyplot(fig)
+
+                    st.write(f"Histogram for {feature}")
+                    fig, ax = plt.subplots(figsize=(8, 6))
+                    sns.histplot(df[feature], kde=True, bins=30, ax=ax)
+                    st.pyplot(fig)
+
+            temp_csv_path = os.path.join(TEMP_DIR, uploaded_file.name)
+            with open(temp_csv_path, "w") as f:
+                f.write(uploaded_file.getvalue().decode("utf-8"))
+
+            notebook = generate_notebook_code(temp_csv_path, target_column, problem_type)
+            notebook_buffer = io.StringIO()
+            nbf.write(notebook, notebook_buffer)
+            notebook_buffer.seek(0)
+            notebook_content = notebook_buffer.getvalue()
+
+            st.download_button(
+                label="Download Code Notebook",
+                data=notebook_content,
+                file_name="data_science_pipeline.ipynb",
+                mime="application/json"
+            )
+
+    except Exception as e:
+        st.error(f"An error occurred: {e}")
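
Note: the app stores every upload in SQLite but never reads it back; get_datasets() and load_dataset_from_db() are defined in app.py yet unused in the UI. A minimal sketch, not part of this commit, of how those helpers could be surfaced in the sidebar; the widget labels and the pd.read_csv round-trip are hypothetical, and the snippet assumes it sits in app.py after the helper definitions:

import pandas as pd
import streamlit as st

# Sketch: reload a previously saved dataset via the currently unused
# helpers defined earlier in app.py. Assumes get_datasets() returns
# [(id, name), ...] and load_dataset_from_db(id) returns a StringIO
# of the saved CSV, matching the definitions in the diff above.
saved = get_datasets()
if saved:
    labels = {f"{ds_id}: {name}": ds_id for ds_id, name in saved}
    choice = st.sidebar.selectbox("Reload a saved dataset", list(labels))
    if st.sidebar.button("Load"):
        buffer = load_dataset_from_db(labels[choice])
        if buffer is not None:
            df = pd.read_csv(buffer)  # StringIO round-trips through read_csv
            st.write(df.head())

Because load_dataset_from_db returns a StringIO, a reloaded dataset could feed the same preprocessing and training pipeline as a fresh upload.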