mdik1 committed on
Commit b80a552 · verified · 1 Parent(s): 4c0f415

Upload 4 files

Files changed (4)
  1. README.md +12 -12
  2. app.py +227 -0
  3. download_files_app.py +47 -0
  4. requirements.txt +7 -0
README.md CHANGED
@@ -1,12 +1,12 @@
- ---
- title: Automatic Data Science
- emoji: 🔥
- colorFrom: yellow
- colorTo: gray
- sdk: streamlit
- sdk_version: 1.41.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Automatic Data Science
+ emoji: 🔥
+ colorFrom: yellow
+ colorTo: gray
+ sdk: streamlit
+ sdk_version: 1.41.1
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,227 @@
+ import streamlit as st
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.metrics import classification_report, accuracy_score
+ import nbformat as nbf
+ import io
+ import sqlite3
+ from io import StringIO
+ import os
+
+ # Constants
+ DB_PATH = "db/database.db"
+ TEMP_DIR = "temp/"
+
+ # Ensure directories exist
+ os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
+ os.makedirs(TEMP_DIR, exist_ok=True)
+
+ # Initialize SQLite database
+ def init_db():
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("""
+         CREATE TABLE IF NOT EXISTS datasets (
+             id INTEGER PRIMARY KEY AUTOINCREMENT,
+             name TEXT NOT NULL,
+             content TEXT NOT NULL
+         )
+     """)
+     conn.commit()
+     conn.close()
+
+ # Save dataset to SQLite
+ def save_dataset_to_db(name, content):
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("INSERT INTO datasets (name, content) VALUES (?, ?)", (name, content))
+     conn.commit()
+     conn.close()
+
+ # Fetch all datasets from SQLite
+ def get_datasets():
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("SELECT id, name FROM datasets")
+     datasets = cursor.fetchall()
+     conn.close()
+     return datasets
+
+ # Load dataset by ID
+ def load_dataset_from_db(dataset_id):
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("SELECT content FROM datasets WHERE id = ?", (dataset_id,))
+     content = cursor.fetchone()
+     conn.close()
+     if content:
+         return StringIO(content[0])
+     return None
+
+ # Initialize database
+ init_db()
+
+ # Function to detect problem type
+ def detect_problem_type(df, target_column):
+     if target_column not in df.columns:
+         return "Error: Target column not found in the dataset."
+
+     df_clean = df.dropna(subset=[target_column])
+     unique_values = df_clean[target_column].nunique()
+     if unique_values == 2:
+         return "binary_classification"
+     elif unique_values > 2:
+         return "multiclass_classification"
+     else:
+         return "Error: Invalid target column (not enough unique values)."
+
+ # Function to generate notebook content
+ def generate_notebook_code(csv_path, target_column, problem_type):
+     notebook = nbf.v4.new_notebook()
+     code = f"""
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.metrics import classification_report, accuracy_score
+
+ # Load Dataset
+ df = pd.read_csv("{csv_path}")
+ target_column = "{target_column}"
+
+ # Display the first few rows
+ print(df.head())
+
+ # Check for missing values
+ print("Missing Values:\\n", df.isnull().sum())
+
+ # Encode categorical columns
+ categorical_cols = df.select_dtypes(include=['object']).columns
+ for col in categorical_cols:
+     df[col] = LabelEncoder().fit_transform(df[col])
+
+ # Fill missing values with median
+ df.fillna(df.median(), inplace=True)
+
+ # Split data into features and target
+ X = df.drop(columns=[target_column])
+ y = df[target_column]
+
+ # Standardize numeric columns
+ scaler = StandardScaler()
+ X = scaler.fit_transform(X)
+
+ # Train/Test Split
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+ # Train Model
+ model = RandomForestClassifier() if "{problem_type}" in ["binary_classification", "multiclass_classification"] else None
+ model.fit(X_train, y_train)
+
+ # Predict and Evaluate
+ y_pred = model.predict(X_test)
+ print("Accuracy Score:", accuracy_score(y_test, y_pred))
+ print("Classification Report:\\n", classification_report(y_test, y_pred))
+ """
+     notebook.cells.append(nbf.v4.new_code_cell(code))
+     return notebook
+
+ # Streamlit app
+ st.title("Automated Data Science App")
+ st.write("Upload a CSV file and specify the target column to automatically process and train models.")
+
+ # File upload
+ uploaded_file = st.file_uploader("Upload your CSV file", type="csv")
+ target_column = st.text_input("Enter the target column name")
+
+ if uploaded_file and target_column:
+     try:
+         df = pd.read_csv(uploaded_file)
+         st.write("Dataset Preview:")
+         st.write(df.head())
+
+         st.subheader("Missing Values")
+         st.write(df.isnull().sum())
+
+         st.subheader("Basic Statistics")
+         st.write(df.describe())
+
+         problem_type = detect_problem_type(df, target_column)
+         if "Error" in problem_type:
+             st.error(problem_type)
+         else:
+             st.write(f"Detected Problem Type: {problem_type}")
+
+             # Save dataset to database
+             save_dataset_to_db(uploaded_file.name, uploaded_file.getvalue().decode("utf-8"))
+
+             categorical_cols = df.select_dtypes(include=['object']).columns
+             for col in categorical_cols:
+                 df[col] = LabelEncoder().fit_transform(df[col])
+
+             df.fillna(df.median(), inplace=True)
+             X = df.drop(columns=[target_column])
+             y = df[target_column]
+
+             scaler = StandardScaler()
+             X = scaler.fit_transform(X)
+
+             X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+             model = RandomForestClassifier()
+             model.fit(X_train, y_train)
+
+             # Evaluate model
+             y_pred = model.predict(X_test)
+             st.subheader("Model Performance")
+             st.write("Accuracy:", accuracy_score(y_test, y_pred))
+
+             # Display the classification report with proper formatting
+             st.write("Classification Report:")
+             report = classification_report(y_test, y_pred)
+             st.code(report)  # st.code keeps the report's fixed-width layout intact
+
+             feature_importances = model.feature_importances_
+             important_features = pd.Series(feature_importances, index=df.drop(columns=[target_column]).columns)
+             important_features = important_features.sort_values(ascending=False).head(5)
+
+             st.subheader("Important Features")
+             st.write(important_features)
+
+             st.subheader("Visualizations")
+             for feature in important_features.index:
+                 st.write(f"Box Plot for {feature}")
+                 fig, ax = plt.subplots(figsize=(8, 6))
+                 sns.boxplot(x=y, y=df[feature], ax=ax)
+                 st.pyplot(fig)
+
+                 st.write(f"Histogram for {feature}")
+                 fig, ax = plt.subplots(figsize=(8, 6))
+                 sns.histplot(df[feature], kde=True, bins=30, ax=ax)
+                 st.pyplot(fig)
+
+             temp_csv_path = os.path.join(TEMP_DIR, uploaded_file.name)
+             with open(temp_csv_path, "w") as f:
+                 f.write(uploaded_file.getvalue().decode("utf-8"))
+
+             notebook = generate_notebook_code(temp_csv_path, target_column, problem_type)
+             notebook_buffer = io.StringIO()
+             nbf.write(notebook, notebook_buffer)
+             notebook_buffer.seek(0)
+             notebook_content = notebook_buffer.getvalue()
+
+             st.download_button(
+                 label="Download Code Notebook",
+                 data=notebook_content,
+                 file_name="data_science_pipeline.ipynb",
+                 mime="application/json"
+             )
+
+     except Exception as e:
+         st.error(f"An error occurred: {e}")
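The download step above serializes the generated notebook through an in-memory buffer before handing it to `st.download_button`. A minimal sketch of that same `nbformat` round trip outside Streamlit (the cell contents and output file name here are only placeholders):

```python
import io
import nbformat as nbf

# Build a one-cell notebook the same way generate_notebook_code does.
notebook = nbf.v4.new_notebook()
notebook.cells.append(nbf.v4.new_code_cell('print("hello from the generated notebook")'))

# Serialize to a string buffer, mirroring the code that feeds st.download_button.
buffer = io.StringIO()
nbf.write(notebook, buffer)
notebook_json = buffer.getvalue()

# Writing the JSON to disk yields a file Jupyter can open directly (placeholder name).
with open("example_pipeline.ipynb", "w") as f:
    f.write(notebook_json)
```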
download_files_app.py ADDED
@@ -0,0 +1,47 @@
+ import streamlit as st
+ import sqlite3
+ from io import StringIO
+
+ # Constants
+ DB_PATH = "db/database.db"
+
+ # Fetch all datasets from SQLite
+ def get_datasets():
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("SELECT id, name FROM datasets")
+     datasets = cursor.fetchall()
+     conn.close()
+     return datasets
+
+ # Load dataset content by ID
+ def load_dataset_content(dataset_id):
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("SELECT content FROM datasets WHERE id = ?", (dataset_id,))
+     content = cursor.fetchone()
+     conn.close()
+     if content:
+         return content[0]
+     return None
+
+ # Streamlit app for downloading files
+ st.title("Download Datasets from Database")
+ st.write("Below is the list of all datasets available in the database. Select and download any file.")
+
+ datasets = get_datasets()
+
+ if datasets:
+     for dataset_id, dataset_name in datasets:
+         st.write(f"**Dataset ID**: {dataset_id} | **Name**: {dataset_name}")
+         dataset_content = load_dataset_content(dataset_id)
+
+         if dataset_content:
+             st.download_button(
+                 label=f"Download {dataset_name}",
+                 data=dataset_content,
+                 file_name=dataset_name,
+                 mime="text/csv"
+             )
+ else:
+     st.write("No datasets available in the database.")
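Since both apps share the same `datasets` table, the stored CSV text can also be pulled straight back into pandas. A small sketch, assuming `db/database.db` already holds at least one upload from app.py:

```python
import sqlite3
from io import StringIO

import pandas as pd

DB_PATH = "db/database.db"  # same path both apps use

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# id is AUTOINCREMENT, so the highest id is the most recent upload.
cursor.execute("SELECT name, content FROM datasets ORDER BY id DESC LIMIT 1")
row = cursor.fetchone()
conn.close()

if row:
    name, content = row
    df = pd.read_csv(StringIO(content))  # content is the raw CSV text saved on upload
    print(f"{name}: {df.shape[0]} rows x {df.shape[1]} columns")
else:
    print("No datasets stored yet.")
```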
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ streamlit
+ pandas
+ seaborn
+ matplotlib
+ scikit-learn
+ nbformat
+ sqlite3
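One caveat on the last entry: `sqlite3` ships with the Python standard library rather than PyPI, so `pip install -r requirements.txt` may fail or complain on that line even though the apps' imports need no separate install. A quick sanity check, assuming a standard CPython runtime:

```python
# sqlite3 needs no pip install; it is part of the standard library.
import sqlite3

print(sqlite3.sqlite_version)  # version of the bundled SQLite engine
```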