mdik1 committed on
Commit b80a552 · verified · 1 Parent(s): 4c0f415

Upload 4 files

Files changed (4)
  1. README.md +12 -12
  2. app.py +227 -0
  3. download_files_app.py +47 -0
  4. requirements.txt +7 -0
README.md CHANGED
@@ -1,12 +1,12 @@
- ---
- title: Automatic Data Science
- emoji: 🔥
- colorFrom: yellow
- colorTo: gray
- sdk: streamlit
- sdk_version: 1.41.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Automatic Data Science
+ emoji: 🔥
+ colorFrom: yellow
+ colorTo: gray
+ sdk: streamlit
+ sdk_version: 1.41.1
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,227 @@
+ import streamlit as st
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.metrics import classification_report, accuracy_score
+ import nbformat as nbf
+ import io
+ import sqlite3
+ from io import StringIO
+ import os
+
+ # Constants
+ DB_PATH = "db/database.db"
+ TEMP_DIR = "temp/"
+
+ # Ensure directories exist
+ os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
+ os.makedirs(TEMP_DIR, exist_ok=True)
+
+ # Initialize SQLite database
+ def init_db():
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("""
+         CREATE TABLE IF NOT EXISTS datasets (
+             id INTEGER PRIMARY KEY AUTOINCREMENT,
+             name TEXT NOT NULL,
+             content TEXT NOT NULL
+         )
+     """)
+     conn.commit()
+     conn.close()
+
+ # Save dataset to SQLite
+ def save_dataset_to_db(name, content):
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("INSERT INTO datasets (name, content) VALUES (?, ?)", (name, content))
+     conn.commit()
+     conn.close()
+
+ # Fetch all datasets from SQLite
+ def get_datasets():
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("SELECT id, name FROM datasets")
+     datasets = cursor.fetchall()
+     conn.close()
+     return datasets
+
+ # Load dataset by ID
+ def load_dataset_from_db(dataset_id):
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("SELECT content FROM datasets WHERE id = ?", (dataset_id,))
+     content = cursor.fetchone()
+     conn.close()
+     if content:
+         return StringIO(content[0])
+     return None
+
+ # Initialize database
+ init_db()
+
+ # Function to detect problem type
+ def detect_problem_type(df, target_column):
+     if target_column not in df.columns:
+         return "Error: Target column not found in the dataset."
+
+     df_clean = df.dropna(subset=[target_column])
+     unique_values = df_clean[target_column].nunique()
+     if unique_values == 2:
+         return "binary_classification"
+     elif unique_values > 2:
+         return "multiclass_classification"
+     else:
+         return "Error: Invalid target column (not enough unique values)."
+
+ # Function to generate notebook content
+ def generate_notebook_code(csv_path, target_column, problem_type):
+     notebook = nbf.v4.new_notebook()
+     code = f"""
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.metrics import classification_report, accuracy_score
+
+ # Load Dataset
+ df = pd.read_csv("{csv_path}")
+ target_column = "{target_column}"
+
+ # Display the first few rows
+ print(df.head())
+
+ # Check for missing values
+ print("Missing Values:\\n", df.isnull().sum())
+
+ # Encode categorical columns
+ categorical_cols = df.select_dtypes(include=['object']).columns
+ for col in categorical_cols:
+     df[col] = LabelEncoder().fit_transform(df[col])
+
+ # Fill missing values with median
+ df.fillna(df.median(), inplace=True)
+
+ # Split data into features and target
+ X = df.drop(columns=[target_column])
+ y = df[target_column]
+
+ # Standardize numeric columns
+ scaler = StandardScaler()
+ X = scaler.fit_transform(X)
+
+ # Train/Test Split
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+ # Train Model
+ model = RandomForestClassifier() if "{problem_type}" in ["binary_classification", "multiclass_classification"] else None
+ model.fit(X_train, y_train)
+
+ # Predict and Evaluate
+ y_pred = model.predict(X_test)
+ print("Accuracy Score:", accuracy_score(y_test, y_pred))
+ print("Classification Report:\\n", classification_report(y_test, y_pred))
+ """
+     notebook.cells.append(nbf.v4.new_code_cell(code))
+     return notebook
+
+ # Streamlit app
+ st.title("Automated Data Science App")
+ st.write("Upload a CSV file and specify the target column to automatically process and train models.")
+
+ # File upload
+ uploaded_file = st.file_uploader("Upload your CSV file", type="csv")
+ target_column = st.text_input("Enter the target column name")
+
+ if uploaded_file and target_column:
+     try:
+         df = pd.read_csv(uploaded_file)
+         st.write("Dataset Preview:")
+         st.write(df.head())
+
+         st.subheader("Missing Values")
+         st.write(df.isnull().sum())
+
+         st.subheader("Basic Statistics")
+         st.write(df.describe())
+
+         problem_type = detect_problem_type(df, target_column)
+         if "Error" in problem_type:
+             st.error(problem_type)
+         else:
+             st.write(f"Detected Problem Type: {problem_type}")
+
+             # Save dataset to database
+             save_dataset_to_db(uploaded_file.name, uploaded_file.getvalue().decode("utf-8"))
+
+             categorical_cols = df.select_dtypes(include=['object']).columns
+             for col in categorical_cols:
+                 df[col] = LabelEncoder().fit_transform(df[col])
+
+             df.fillna(df.median(), inplace=True)
+             X = df.drop(columns=[target_column])
+             y = df[target_column]
+
+             scaler = StandardScaler()
+             X = scaler.fit_transform(X)
+
+             X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+             model = RandomForestClassifier()
+             model.fit(X_train, y_train)
+
+             # Evaluate model
+             y_pred = model.predict(X_test)
+             st.subheader("Model Performance")
+             st.write("Accuracy:", accuracy_score(y_test, y_pred))
+
+             # Display the classification report with proper formatting
+             st.write("Classification Report:")
+             report = classification_report(y_test, y_pred)
+             st.code(report)  # st.code keeps the report's fixed-width layout intact
+
+             feature_importances = model.feature_importances_
+             important_features = pd.Series(feature_importances, index=df.drop(columns=[target_column]).columns)
+             important_features = important_features.sort_values(ascending=False).head(5)
+
+             st.subheader("Important Features")
+             st.write(important_features)
+
+             st.subheader("Visualizations")
+             for feature in important_features.index:
+                 st.write(f"Box Plot for {feature}")
+                 fig, ax = plt.subplots(figsize=(8, 6))
+                 sns.boxplot(x=y, y=df[feature], ax=ax)
+                 st.pyplot(fig)
+
+                 st.write(f"Histogram for {feature}")
+                 fig, ax = plt.subplots(figsize=(8, 6))
+                 sns.histplot(df[feature], kde=True, bins=30, ax=ax)
+                 st.pyplot(fig)
+
+             temp_csv_path = os.path.join(TEMP_DIR, uploaded_file.name)
+             with open(temp_csv_path, "w") as f:
+                 f.write(uploaded_file.getvalue().decode("utf-8"))
+
+             notebook = generate_notebook_code(temp_csv_path, target_column, problem_type)
+             notebook_buffer = io.StringIO()
+             nbf.write(notebook, notebook_buffer)
+             notebook_buffer.seek(0)
+             notebook_content = notebook_buffer.getvalue()
+
+             st.download_button(
+                 label="Download Code Notebook",
+                 data=notebook_content,
+                 file_name="data_science_pipeline.ipynb",
+                 mime="application/json"
+             )
+
+     except Exception as e:
+         st.error(f"An error occurred: {e}")
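The download step above serializes the generated notebook through an in-memory buffer before handing it to `st.download_button`. A minimal sketch of that same `nbformat` round trip outside Streamlit (the cell contents and output file name here are only placeholders):

```python
import io
import nbformat as nbf

# Build a one-cell notebook the same way generate_notebook_code does.
notebook = nbf.v4.new_notebook()
notebook.cells.append(nbf.v4.new_code_cell('print("hello from the generated notebook")'))

# Serialize to a string buffer, mirroring the code that feeds st.download_button.
buffer = io.StringIO()
nbf.write(notebook, buffer)
notebook_json = buffer.getvalue()

# Writing the JSON to disk yields a file Jupyter can open directly (placeholder name).
with open("example_pipeline.ipynb", "w") as f:
    f.write(notebook_json)
```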
download_files_app.py ADDED
@@ -0,0 +1,47 @@
+ import streamlit as st
+ import sqlite3
+ from io import StringIO
+
+ # Constants
+ DB_PATH = "db/database.db"
+
+ # Fetch all datasets from SQLite
+ def get_datasets():
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("SELECT id, name FROM datasets")
+     datasets = cursor.fetchall()
+     conn.close()
+     return datasets
+
+ # Load dataset content by ID
+ def load_dataset_content(dataset_id):
+     conn = sqlite3.connect(DB_PATH)
+     cursor = conn.cursor()
+     cursor.execute("SELECT content FROM datasets WHERE id = ?", (dataset_id,))
+     content = cursor.fetchone()
+     conn.close()
+     if content:
+         return content[0]
+     return None
+
+ # Streamlit app for downloading files
+ st.title("Download Datasets from Database")
+ st.write("Below is the list of all datasets available in the database. Select and download any file.")
+
+ datasets = get_datasets()
+
+ if datasets:
+     for dataset_id, dataset_name in datasets:
+         st.write(f"**Dataset ID**: {dataset_id} | **Name**: {dataset_name}")
+         dataset_content = load_dataset_content(dataset_id)
+
+         if dataset_content:
+             st.download_button(
+                 label=f"Download {dataset_name}",
+                 data=dataset_content,
+                 file_name=dataset_name,
+                 mime="text/csv"
+             )
+ else:
+     st.write("No datasets available in the database.")
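Since both apps share the same `datasets` table, the stored CSV text can also be pulled straight back into pandas. A small sketch, assuming `db/database.db` already holds at least one upload from app.py:

```python
import sqlite3
from io import StringIO

import pandas as pd

DB_PATH = "db/database.db"  # same path both apps use

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# id is AUTOINCREMENT, so the highest id is the most recent upload.
cursor.execute("SELECT name, content FROM datasets ORDER BY id DESC LIMIT 1")
row = cursor.fetchone()
conn.close()

if row:
    name, content = row
    df = pd.read_csv(StringIO(content))  # content is the raw CSV text saved on upload
    print(f"{name}: {df.shape[0]} rows x {df.shape[1]} columns")
else:
    print("No datasets stored yet.")
```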
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ streamlit
+ pandas
+ seaborn
+ matplotlib
+ scikit-learn
+ nbformat
+ sqlite3
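One caveat on the last entry: `sqlite3` ships with the Python standard library rather than PyPI, so `pip install -r requirements.txt` may fail or complain on that line even though the apps' imports need no separate install. A quick sanity check, assuming a standard CPython runtime:

```python
# sqlite3 needs no pip install; it is part of the standard library.
import sqlite3

print(sqlite3.sqlite_version)  # version of the bundled SQLite engine
```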