import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import nbformat as nbf
import io
import sqlite3
from io import StringIO
import os
# Constants
DB_PATH = "db/database.db"
TEMP_DIR = "temp/"
# Ensure directories exist
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
os.makedirs(TEMP_DIR, exist_ok=True)
# Initialize SQLite database
def init_db():
    """Create the SQLite datasets table if it does not already exist."""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS datasets (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            content TEXT NOT NULL
        )
    """)
    conn.commit()
    conn.close()
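# init_db() is safe to call on every Streamlit rerun: CREATE TABLE IF NOT
# EXISTS makes it idempotent. A minimal sanity check (illustrative only):
#   init_db()
#   conn = sqlite3.connect(DB_PATH)
#   tables = conn.execute(
#       "SELECT name FROM sqlite_master WHERE type='table'"
#   ).fetchall()
#   conn.close()
#   assert ("datasets",) in tables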
# Save dataset to SQLite
def save_dataset_to_db(name, content):
    """Insert a dataset (file name plus raw CSV text) into SQLite."""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("INSERT INTO datasets (name, content) VALUES (?, ?)", (name, content))
    conn.commit()
    conn.close()
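# Example call (hypothetical file name and contents, for illustration only):
#   save_dataset_to_db("iris.csv", "sepal_length,species\n5.1,setosa\n")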
# Fetch all datasets from SQLite
def get_datasets():
    """Return (id, name) pairs for every stored dataset."""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("SELECT id, name FROM datasets")
    datasets = cursor.fetchall()
    conn.close()
    return datasets
# Load dataset by ID
def load_dataset_from_db(dataset_id):
    """Return the stored CSV as a StringIO buffer, or None if the id is unknown."""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("SELECT content FROM datasets WHERE id = ?", (dataset_id,))
    content = cursor.fetchone()
    conn.close()
    if content:
        return StringIO(content[0])
    return None
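# Round-trip sketch using the two helpers above (the loop body is
# illustrative; the returned buffer feeds straight into pandas):
#   for dataset_id, name in get_datasets():
#       buffer = load_dataset_from_db(dataset_id)
#       if buffer is not None:
#           stored_df = pd.read_csv(buffer)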
# Initialize database
init_db()
# Function to detect problem type
def detect_problem_type(df, target_column):
    """Classify the task from the target column's unique value count.

    Note: a continuous numeric target is also reported as multiclass here,
    since the heuristic only counts distinct values.
    """
    if target_column not in df.columns:
        return "Error: Target column not found in the dataset."
    df_clean = df.dropna(subset=[target_column])
    unique_values = df_clean[target_column].nunique()
    if unique_values == 2:
        return "binary_classification"
    elif unique_values > 2:
        return "multiclass_classification"
    else:
        return "Error: Invalid target column (not enough unique values)."
# Function to generate notebook content
def generate_notebook_code(csv_path, target_column, problem_type):
    """Build an nbformat notebook that reproduces the preprocessing and training pipeline."""
    notebook = nbf.v4.new_notebook()
    # Double braces ({{...}}) are literal braces in this f-string, so they
    # survive into the generated notebook code.
    code = f"""
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load Dataset
df = pd.read_csv("{csv_path}")
target_column = "{target_column}"

# Display the first few rows
print(df.head())

# Check for missing values
print("Missing Values:\\n", df.isnull().sum())

# Encode categorical columns (cast to str so NaN values encode cleanly)
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Fill missing values with the column median
df.fillna(df.median(numeric_only=True), inplace=True)

# Split data into features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# Standardize numeric columns
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Models
models = []
if "{problem_type}" in ["binary_classification", "multiclass_classification"]:
    models.append(("Random Forest", RandomForestClassifier()))
    models.append(("Logistic Regression", LogisticRegression()))
    models.append(("SVM", SVC()))
    models.append(("Decision Tree", DecisionTreeClassifier()))

# Model Evaluation
results = []
for model_name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append((model_name, accuracy))

print("Model Performance:")
for model_name, accuracy in results:
    print(f"{{model_name}}: {{accuracy}}")
"""
    notebook.cells.append(nbf.v4.new_code_cell(code))
    return notebook
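# To write the generated notebook straight to disk instead of streaming it
# through the app (paths and column names are illustrative):
#   nb = generate_notebook_code("temp/iris.csv", "species", "multiclass_classification")
#   with open("pipeline.ipynb", "w") as f:
#       nbf.write(nb, f)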
# Streamlit app
st.title("Automated Data Science App")
st.write("Upload a CSV file and specify the target column to automatically process and train models.")
# File upload
uploaded_file = st.file_uploader("Upload your CSV file", type="csv")
target_column = st.text_input("Enter the target column name")
if uploaded_file and target_column:
    try:
        df = pd.read_csv(uploaded_file)
        st.write("Dataset Preview:")
        st.write(df.head())

        st.subheader("Missing Values")
        st.write(df.isnull().sum())

        st.subheader("Basic Statistics")
        st.write(df.describe())

        problem_type = detect_problem_type(df, target_column)
        if "Error" in problem_type:
            st.error(problem_type)
        else:
            st.write(f"Detected Problem Type: {problem_type}")

            # Save dataset to database
            save_dataset_to_db(uploaded_file.name, uploaded_file.getvalue().decode("utf-8"))

            # Encode categorical columns (cast to str so NaN values encode cleanly)
            categorical_cols = df.select_dtypes(include=['object']).columns
            for col in categorical_cols:
                df[col] = LabelEncoder().fit_transform(df[col].astype(str))
            df.fillna(df.median(numeric_only=True), inplace=True)

            X = df.drop(columns=[target_column])
            y = df[target_column]
            scaler = StandardScaler()
            X = scaler.fit_transform(X)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            models = [
                ("Random Forest", RandomForestClassifier()),
                ("Logistic Regression", LogisticRegression()),
                ("SVM", SVC()),
                ("Decision Tree", DecisionTreeClassifier())
            ]
            results = []
            for model_name, model in models:
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                results.append((model_name, accuracy))

            # Display results in a table
            st.subheader("Model Performance")
            results_df = pd.DataFrame(results, columns=["Model Name", "Accuracy"])
            st.write(results_df)

            # Classification report for the last model trained in the loop
            # (the Decision Tree), since y_pred still holds its predictions
            st.subheader("Classification Report")
            report = classification_report(y_test, y_pred)
            st.code(report)  # st.code preserves the report's fixed-width layout

            # Feature importances also come from that last model; tree-based
            # models expose feature_importances_, the others do not
            feature_importances = model.feature_importances_ if hasattr(model, "feature_importances_") else None
            if feature_importances is not None:
                important_features = pd.Series(feature_importances, index=df.drop(columns=[target_column]).columns)
                important_features = important_features.sort_values(ascending=False).head(5)
                st.subheader("Important Features")
                st.write(important_features)

                st.subheader("Visualizations")
                for feature in important_features.index:
                    st.write(f"Box Plot for {feature}")
                    fig, ax = plt.subplots(figsize=(8, 6))
                    sns.boxplot(x=y, y=df[feature], ax=ax)
                    st.pyplot(fig)

                    st.write(f"Histogram for {feature}")
                    fig, ax = plt.subplots(figsize=(8, 6))
                    sns.histplot(df[feature], kde=True, bins=30, ax=ax)
                    st.pyplot(fig)

            # Persist the upload so the generated notebook can re-read it
            temp_csv_path = os.path.join(TEMP_DIR, uploaded_file.name)
            with open(temp_csv_path, "w") as f:
                f.write(uploaded_file.getvalue().decode("utf-8"))

            notebook = generate_notebook_code(temp_csv_path, target_column, problem_type)
            notebook_buffer = io.StringIO()
            nbf.write(notebook, notebook_buffer)
            notebook_buffer.seek(0)
            notebook_content = notebook_buffer.getvalue()

            st.download_button(
                label="Download Code Notebook",
                data=notebook_content,
                file_name="data_science_pipeline.ipynb",
                mime="application/json"
            )
    except Exception as e:
        st.error(f"An error occurred: {e}")