mdik1 committed
Commit da0e299 · verified · 1 Parent(s): 4025460

Update app.py

Files changed (1):
  1. app.py +256 -227
app.py CHANGED
@@ -1,227 +1,256 @@
-import streamlit as st
-import pandas as pd
-import seaborn as sns
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler, LabelEncoder
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import classification_report, accuracy_score
-import nbformat as nbf
-import io
-import sqlite3
-from io import StringIO
-import os
-
-# Constants
-DB_PATH = "db/database.db"
-TEMP_DIR = "temp/"
-
-# Ensure directories exist
-os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
-os.makedirs(TEMP_DIR, exist_ok=True)
-
-# Initialize SQLite database
-def init_db():
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("""
-        CREATE TABLE IF NOT EXISTS datasets (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            name TEXT NOT NULL,
-            content TEXT NOT NULL
-        )
-    """)
-    conn.commit()
-    conn.close()
-
-# Save dataset to SQLite
-def save_dataset_to_db(name, content):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("INSERT INTO datasets (name, content) VALUES (?, ?)", (name, content))
-    conn.commit()
-    conn.close()
-
-# Fetch all datasets from SQLite
-def get_datasets():
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("SELECT id, name FROM datasets")
-    datasets = cursor.fetchall()
-    conn.close()
-    return datasets
-
-# Load dataset by ID
-def load_dataset_from_db(dataset_id):
-    conn = sqlite3.connect(DB_PATH)
-    cursor = conn.cursor()
-    cursor.execute("SELECT content FROM datasets WHERE id = ?", (dataset_id,))
-    content = cursor.fetchone()
-    conn.close()
-    if content:
-        return StringIO(content[0])
-    return None
-
-# Initialize database
-init_db()
-
-# Function to detect problem type
-def detect_problem_type(df, target_column):
-    if target_column not in df.columns:
-        return "Error: Target column not found in the dataset."
-
-    df_clean = df.dropna(subset=[target_column])
-    unique_values = df_clean[target_column].nunique()
-    if unique_values == 2:
-        return "binary_classification"
-    elif unique_values > 2:
-        return "multiclass_classification"
-    else:
-        return "Error: Invalid target column (not enough unique values)."
-
-# Function to generate notebook content
-def generate_notebook_code(csv_path, target_column, problem_type):
-    notebook = nbf.v4.new_notebook()
-    code = f"""
-import pandas as pd
-import seaborn as sns
-import matplotlib.pyplot as plt
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler, LabelEncoder
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import classification_report, accuracy_score
-
-# Load Dataset
-df = pd.read_csv("{csv_path}")
-target_column = "{target_column}"
-
-# Display the first few rows
-print(df.head())
-
-# Check for missing values
-print("Missing Values:\\n", df.isnull().sum())
-
-# Encode categorical columns
-categorical_cols = df.select_dtypes(include=['object']).columns
-for col in categorical_cols:
-    df[col] = LabelEncoder().fit_transform(df[col])
-
-# Fill missing values with median
-df.fillna(df.median(), inplace=True)
-
-# Split data into features and target
-X = df.drop(columns=[target_column])
-y = df[target_column]
-
-# Standardize numeric columns
-scaler = StandardScaler()
-X = scaler.fit_transform(X)
-
-# Train/Test Split
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-# Train Model
-model = RandomForestClassifier() if "{problem_type}" in ["binary_classification", "multiclass_classification"] else None
-model.fit(X_train, y_train)
-
-# Predict and Evaluate
-y_pred = model.predict(X_test)
-print("Accuracy Score:", accuracy_score(y_test, y_pred))
-print("Classification Report:\\n", classification_report(y_test, y_pred))
-"""
-    notebook.cells.append(nbf.v4.new_code_cell(code))
-    return notebook
-
-# Streamlit app
-st.title("Automated Data Science App")
-st.write("Upload a CSV file and specify the target column to automatically process and train models.")
-
-# File upload
-uploaded_file = st.file_uploader("Upload your CSV file", type="csv")
-target_column = st.text_input("Enter the target column name")
-
-if uploaded_file and target_column:
-    try:
-        df = pd.read_csv(uploaded_file)
-        st.write("Dataset Preview:")
-        st.write(df.head())
-
-        st.subheader("Missing Values")
-        st.write(df.isnull().sum())
-
-        st.subheader("Basic Statistics")
-        st.write(df.describe())
-
-        problem_type = detect_problem_type(df, target_column)
-        if "Error" in problem_type:
-            st.error(problem_type)
-        else:
-            st.write(f"Detected Problem Type: {problem_type}")
-
-            # Save dataset to database
-            save_dataset_to_db(uploaded_file.name, uploaded_file.getvalue().decode("utf-8"))
-
-            categorical_cols = df.select_dtypes(include=['object']).columns
-            for col in categorical_cols:
-                df[col] = LabelEncoder().fit_transform(df[col])
-
-            df.fillna(df.median(), inplace=True)
-            X = df.drop(columns=[target_column])
-            y = df[target_column]
-
-            scaler = StandardScaler()
-            X = scaler.fit_transform(X)
-
-            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-            model = RandomForestClassifier()
-            model.fit(X_train, y_train)
-
-            # Evaluate model
-            y_pred = model.predict(X_test)
-            st.subheader("Model Performance")
-            st.write("Accuracy:", accuracy_score(y_test, y_pred))
-
-            # Display the classification report with proper formatting
-            st.write("Classification Report:")
-            report = classification_report(y_test, y_pred)
-            st.code(report)  # st.text ensures the report is displayed with proper formatting
-
-            feature_importances = model.feature_importances_
-            important_features = pd.Series(feature_importances, index=df.drop(columns=[target_column]).columns)
-            important_features = important_features.sort_values(ascending=False).head(5)
-
-            st.subheader("Important Features")
-            st.write(important_features)
-
-            st.subheader("Visualizations")
-            for feature in important_features.index:
-                st.write(f"Box Plot for {feature}")
-                fig, ax = plt.subplots(figsize=(8, 6))
-                sns.boxplot(x=y, y=df[feature], ax=ax)
-                st.pyplot(fig)
-
-                st.write(f"Histogram for {feature}")
-                fig, ax = plt.subplots(figsize=(8, 6))
-                sns.histplot(df[feature], kde=True, bins=30, ax=ax)
-                st.pyplot(fig)
-
-            temp_csv_path = os.path.join(TEMP_DIR, uploaded_file.name)
-            with open(temp_csv_path, "w") as f:
-                f.write(uploaded_file.getvalue().decode("utf-8"))
-
-            notebook = generate_notebook_code(temp_csv_path, target_column, problem_type)
-            notebook_buffer = io.StringIO()
-            nbf.write(notebook, notebook_buffer)
-            notebook_buffer.seek(0)
-            notebook_content = notebook_buffer.getvalue()
-
-            st.download_button(
-                label="Download Code Notebook",
-                data=notebook_content,
-                file_name="data_science_pipeline.ipynb",
-                mime="application/json"
-            )
-
-    except Exception as e:
-        st.error(f"An error occurred: {e}")
+import streamlit as st
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import classification_report, accuracy_score
+import nbformat as nbf
+import io
+import sqlite3
+from io import StringIO
+import os
+
+# Constants
+DB_PATH = "db/database.db"
+TEMP_DIR = "temp/"
+
+# Ensure directories exist
+os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
+os.makedirs(TEMP_DIR, exist_ok=True)
+
+# Initialize SQLite database
+def init_db():
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("""
+        CREATE TABLE IF NOT EXISTS datasets (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name TEXT NOT NULL,
+            content TEXT NOT NULL
+        )
+    """)
+    conn.commit()
+    conn.close()
+
+# Save dataset to SQLite
+def save_dataset_to_db(name, content):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("INSERT INTO datasets (name, content) VALUES (?, ?)", (name, content))
+    conn.commit()
+    conn.close()
+
+# Fetch all datasets from SQLite
+def get_datasets():
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("SELECT id, name FROM datasets")
+    datasets = cursor.fetchall()
+    conn.close()
+    return datasets
+
+# Load dataset by ID
+def load_dataset_from_db(dataset_id):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("SELECT content FROM datasets WHERE id = ?", (dataset_id,))
+    content = cursor.fetchone()
+    conn.close()
+    if content:
+        return StringIO(content[0])
+    return None
+
+# Initialize database
+init_db()
+
+# Function to detect problem type
+def detect_problem_type(df, target_column):
+    if target_column not in df.columns:
+        return "Error: Target column not found in the dataset."
+
+    df_clean = df.dropna(subset=[target_column])
+    unique_values = df_clean[target_column].nunique()
+    if unique_values == 2:
+        return "binary_classification"
+    elif unique_values > 2:
+        return "multiclass_classification"
+    else:
+        return "Error: Invalid target column (not enough unique values)."
+
+# Function to generate notebook content
+def generate_notebook_code(csv_path, target_column, problem_type):
+    notebook = nbf.v4.new_notebook()
+    code = f"""
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import classification_report, accuracy_score
+
+# Load Dataset
+df = pd.read_csv("{csv_path}")
+target_column = "{target_column}"
+
+# Display the first few rows
+print(df.head())
+
+# Check for missing values
+print("Missing Values:\\n", df.isnull().sum())
+
+# Encode categorical columns
+categorical_cols = df.select_dtypes(include=['object']).columns
+for col in categorical_cols:
+    df[col] = LabelEncoder().fit_transform(df[col])
+
+# Fill missing values with median
+df.fillna(df.median(), inplace=True)
+
+# Split data into features and target
+X = df.drop(columns=[target_column])
+y = df[target_column]
+
+# Standardize numeric columns
+scaler = StandardScaler()
+X = scaler.fit_transform(X)
+
+# Train/Test Split
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Train Models
+models = []
+if "{problem_type}" in ["binary_classification", "multiclass_classification"]:
+    models.append(("Random Forest", RandomForestClassifier()))
+    models.append(("Logistic Regression", LogisticRegression()))
+    models.append(("SVM", SVC()))
+    models.append(("Decision Tree", DecisionTreeClassifier()))
+
+# Model Evaluation
+results = []
+for model_name, model in models:
+    model.fit(X_train, y_train)
+    y_pred = model.predict(X_test)
+    accuracy = accuracy_score(y_test, y_pred)
+    results.append((model_name, accuracy))
+
+print("Model Performance:")
+for model_name, accuracy in results:
+    print(f"{{model_name}}: {{accuracy}}")  # NOTE: braces doubled so the outer f-string writes them literally into the notebook
+"""
+    notebook.cells.append(nbf.v4.new_code_cell(code))
+    return notebook
+
+# Streamlit app
+st.title("Automated Data Science App")
+st.write("Upload a CSV file and specify the target column to automatically process and train models.")
+
+# File upload
+uploaded_file = st.file_uploader("Upload your CSV file", type="csv")
+target_column = st.text_input("Enter the target column name")
+
+if uploaded_file and target_column:
+    try:
+        df = pd.read_csv(uploaded_file)
+        st.write("Dataset Preview:")
+        st.write(df.head())
+
+        st.subheader("Missing Values")
+        st.write(df.isnull().sum())
+
+        st.subheader("Basic Statistics")
+        st.write(df.describe())
+
+        problem_type = detect_problem_type(df, target_column)
+        if "Error" in problem_type:
+            st.error(problem_type)
+        else:
+            st.write(f"Detected Problem Type: {problem_type}")
+
+            # Save dataset to database
+            save_dataset_to_db(uploaded_file.name, uploaded_file.getvalue().decode("utf-8"))
+
+            categorical_cols = df.select_dtypes(include=['object']).columns
+            for col in categorical_cols:
+                df[col] = LabelEncoder().fit_transform(df[col])
+
+            df.fillna(df.median(), inplace=True)
+            X = df.drop(columns=[target_column])
+            y = df[target_column]
+
+            scaler = StandardScaler()
+            X = scaler.fit_transform(X)
+
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+            models = [
+                ("Random Forest", RandomForestClassifier()),
+                ("Logistic Regression", LogisticRegression()),
+                ("SVM", SVC()),
+                ("Decision Tree", DecisionTreeClassifier())
+            ]
+
+            results = []
+            for model_name, model in models:
+                model.fit(X_train, y_train)
+                y_pred = model.predict(X_test)
+                accuracy = accuracy_score(y_test, y_pred)
+                results.append((model_name, accuracy))
+
+            # Display results in a table
+            st.subheader("Model Performance")
+            results_df = pd.DataFrame(results, columns=["Model Name", "Accuracy"])
+            st.write(results_df)
+
+            # Display the classification report with proper formatting
+            st.subheader("Classification Report")
+            report = classification_report(y_test, y_pred)
+            st.code(report)  # st.code keeps the report's fixed-width formatting; y_pred here comes from the last model in the loop (Decision Tree)
+
+            feature_importances = model.feature_importances_ if hasattr(model, "feature_importances_") else None
+            if feature_importances is not None:
+                important_features = pd.Series(feature_importances, index=df.drop(columns=[target_column]).columns)
+                important_features = important_features.sort_values(ascending=False).head(5)
+
+                st.subheader("Important Features")
+                st.write(important_features)
+
+                st.subheader("Visualizations")
+                for feature in important_features.index:
+                    st.write(f"Box Plot for {feature}")
+                    fig, ax = plt.subplots(figsize=(8, 6))
+                    sns.boxplot(x=y, y=df[feature], ax=ax)
+                    st.pyplot(fig)
+
+                    st.write(f"Histogram for {feature}")
+                    fig, ax = plt.subplots(figsize=(8, 6))
+                    sns.histplot(df[feature], kde=True, bins=30, ax=ax)
+                    st.pyplot(fig)
+
+            temp_csv_path = os.path.join(TEMP_DIR, uploaded_file.name)
+            with open(temp_csv_path, "w") as f:
+                f.write(uploaded_file.getvalue().decode("utf-8"))
+
+            notebook = generate_notebook_code(temp_csv_path, target_column, problem_type)
+            notebook_buffer = io.StringIO()
+            nbf.write(notebook, notebook_buffer)
+            notebook_buffer.seek(0)
+            notebook_content = notebook_buffer.getvalue()
+
+            st.download_button(
+                label="Download Code Notebook",
+                data=notebook_content,
+                file_name="data_science_pipeline.ipynb",
+                mime="application/json"
+            )
+
+    except Exception as e:
+        st.error(f"An error occurred: {e}")
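
Note: the app stores every upload in SQLite but never reads it back; get_datasets() and load_dataset_from_db() are defined in app.py yet unused in the UI. A minimal sketch, not part of this commit, of how those helpers could be surfaced in the sidebar; the widget labels and the pd.read_csv round-trip are hypothetical, and the snippet assumes it sits in app.py after the helper definitions:

import pandas as pd
import streamlit as st

# Sketch: reload a previously saved dataset via the currently unused
# helpers defined earlier in app.py. Assumes get_datasets() returns
# [(id, name), ...] and load_dataset_from_db(id) returns a StringIO
# of the saved CSV, matching the definitions in the diff above.
saved = get_datasets()
if saved:
    labels = {f"{ds_id}: {name}": ds_id for ds_id, name in saved}
    choice = st.sidebar.selectbox("Reload a saved dataset", list(labels))
    if st.sidebar.button("Load"):
        buffer = load_dataset_from_db(labels[choice])
        if buffer is not None:
            df = pd.read_csv(buffer)  # StringIO round-trips through read_csv
            st.write(df.head())

Because load_dataset_from_db returns a StringIO, a reloaded dataset could feed the same preprocessing and training pipeline as a fresh upload.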