Spaces:
Build error
Build error
import io
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st

from utils import (
    apply_transformation,
    clean_dataset,
    compute_dataset_score,
    detect_inconsistent_types,
    detect_outliers,
    list_datasets,
    load_dataset,
    save_dataset,
)
# -------------------------------
# Constants & Setup
# -------------------------------
# Folder in which every selectable / uploaded dataset is stored.
DATASET_DIR = "datasets"
# Default dataset file name (not referenced in the visible part of the script).
DEFAULT_DATASET = "train_data.csv"

# Create the dataset folder up front so later listing and saving cannot fail
# on a missing directory; a no-op when it already exists.
os.makedirs(name=DATASET_DIR, exist_ok=True)
# -------------------------------
# Sidebar: Dataset Selection
# -------------------------------
# The user either picks a dataset already stored in DATASET_DIR or uploads a
# new CSV / JSON / Excel file, which is parsed and saved into DATASET_DIR.
# On exit `dataset_path` holds the chosen file path, or None when nothing
# has been selected or uploaded yet.
st.sidebar.header("π Dataset Selection")

# List available datasets from the datasets folder
available_datasets = list_datasets(DATASET_DIR)
dataset_choice = st.sidebar.radio(
    "Choose Dataset Source:",
    ["Select Existing Dataset", "Upload New Dataset"],
)
dataset_path = None

if dataset_choice == "Select Existing Dataset":
    if available_datasets:
        selected_dataset = st.sidebar.selectbox("Select Dataset:", available_datasets)
        dataset_path = os.path.join(DATASET_DIR, selected_dataset)
        st.sidebar.success(f"Using `{selected_dataset}` dataset.")
    else:
        st.sidebar.warning("No datasets found. Please upload a new dataset.")
elif dataset_choice == "Upload New Dataset":
    uploaded_file = st.sidebar.file_uploader(
        "Upload Dataset (CSV, JSON, or Excel)",
        type=["csv", "json", "xlsx"],
    )
    if uploaded_file:
        # The file name is supplied by the client: keep only its last path
        # component so a name like "../../x.csv" cannot escape DATASET_DIR.
        upload_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
        file_ext = upload_name.split(".")[-1].lower()
        try:
            if file_ext == "csv":
                new_df = pd.read_csv(uploaded_file)
            elif file_ext == "json":
                # Requires the module-level `import json`; without it this
                # branch fails with a NameError on every JSON upload.
                new_df = pd.json_normalize(json.load(uploaded_file))
            elif file_ext == "xlsx":
                new_df = pd.read_excel(uploaded_file)
            else:
                st.error("Unsupported file format.")
                st.stop()
        except Exception as e:
            # Any parsing problem is reported to the user and ends this run.
            st.error(f"Error reading file: {e}")
            st.stop()

        # Save the new dataset with its (sanitized) filename
        dataset_path = os.path.join(DATASET_DIR, upload_name)
        save_dataset(new_df, dataset_path)
        st.sidebar.success(f"Dataset `{upload_name}` uploaded successfully!")
        available_datasets = list_datasets(DATASET_DIR)  # Refresh list
    else:
        st.sidebar.warning("Please upload a dataset.")
# -------------------------------
# Load the Selected Dataset
# -------------------------------
# `df` is always defined after this section: either the loaded dataset or an
# empty DataFrame, so every tab below can simply test `df.empty`.
if not dataset_path:
    df = pd.DataFrame()
    st.warning("No dataset selected. Please choose or upload a dataset.")
else:
    df = load_dataset(dataset_path)
    if df.empty:
        st.warning("Dataset is empty or failed to load.")
# -------------------------------
# Main App Title & Description
# -------------------------------
st.title("π The Data Hub")

# -------------------------------
# Tabs for Operations
# -------------------------------
# One tab per operation; the sections below address them by position.
tab_labels = [
    "View & Summary",
    "Clean Data",
    "Visualize Data",
    "Data Profiling",
    "Outlier Detection",
    "Custom Transformations",
    "Export",
]
tabs = st.tabs(tab_labels)
# -------------------------------
# Tab 1: View & Summary
# -------------------------------
# Shows the full dataset followed by `describe()` statistics for all columns.
with tabs[0]:
    st.subheader("π Current Dataset Preview")
    if df.empty:
        st.warning("No dataset available. Please choose or upload a dataset.")
    else:
        st.dataframe(df)
        st.markdown("#### π Basic Statistics")
        summary_stats = df.describe(include="all")
        st.write(summary_stats)
# -------------------------------
# Tab 2: Clean Data
# -------------------------------
# Collects the cleaning options and one rename field per column; pressing the
# button cleans the dataset, applies the renames, writes the result back to
# `dataset_path` and makes it the working `df` for the tabs that follow.
with tabs[1]:
    st.subheader("π§Ό Clean Your Dataset")
    if df.empty:
        st.warning("No dataset available for cleaning.")
    else:
        remove_duplicates = st.checkbox("Remove Duplicate Rows", value=True)
        fill_missing = st.checkbox("Fill Missing Values", value=False)
        fill_value = st.text_input("Fill missing values with:", value="0")

        st.markdown("#### Optional: Rename Columns")
        # Maps each current column name to the (possibly unchanged) new name.
        new_names = {
            column: st.text_input(f"Rename column '{column}'", value=column)
            for column in df.columns
        }

        if st.button("Clean Dataset"):
            cleaned_df = clean_dataset(
                df, remove_duplicates, fill_missing, fill_value
            ).rename(columns=new_names)
            save_dataset(cleaned_df, dataset_path)
            st.success("β Dataset cleaned successfully!")
            st.dataframe(cleaned_df.head())
            df = cleaned_df
# -------------------------------
# Tab 3: Visualize Data (Fixed KeyError Issue)
# -------------------------------
# Draws one chart of the selected type over the numeric columns of `df`.
# Histogram, box plot and line chart use the single selected column; scatter
# asks for separate X / Y columns; the heatmap uses all numeric columns.
with tabs[2]:
    st.subheader("π Visualize Your Data")
    if not df.empty:
        viz_type = st.selectbox(
            "Select Visualization Type",
            ["Histogram", "Scatter", "Box Plot", "Heatmap", "Line Chart"],
        )
        numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
        if numeric_cols:
            col = st.selectbox("Select Column", numeric_cols)
            # Compare against None rather than truthiness: a column label
            # such as 0 is falsy but perfectly valid.
            if col is not None:
                fig, ax = plt.subplots()
                if viz_type == "Histogram":
                    ax.hist(df[col].dropna(), bins=20, color="skyblue", edgecolor="black")
                elif viz_type == "Box Plot":
                    sns.boxplot(x=df[col].dropna(), ax=ax)
                elif viz_type == "Scatter":
                    x_col = st.selectbox("X-axis", numeric_cols)
                    y_col = st.selectbox("Y-axis", numeric_cols)
                    if x_col is not None and y_col is not None:
                        ax.scatter(df[x_col], df[y_col], color="green")
                elif viz_type == "Heatmap":
                    corr = df[numeric_cols].corr()
                    sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
                elif viz_type == "Line Chart":
                    ax.plot(df.index, df[col], marker="o")
                st.pyplot(fig)
                # pyplot keeps every figure alive until it is closed; without
                # this each script rerun would leave one more figure in memory.
                plt.close(fig)
            else:
                st.warning("Please select a valid column.")
        else:
            st.warning("No numeric columns available for visualization.")
    else:
        st.warning("No dataset available for visualization.")
# -------------------------------
# Tab 4: Data Profiling
# -------------------------------
# Renders a multi-part profile of `df`: general info, quality score, per-column
# overview, missing values, duplicates / constant columns, cardinality,
# frequent and rare values, correlation matrix, pair plot, outliers and
# inconsistent data types. Every matplotlib figure is closed after rendering
# so figures do not pile up across script reruns.
with tabs[3]:
    if not df.empty:
        # -------------------------------
        # 1. General Dataset Info
        # -------------------------------
        st.markdown("### π οΈ General Information")
        st.write(f"β **Total Rows:** `{df.shape[0]}`")
        st.write(f"β **Total Columns:** `{df.shape[1]}`")
        st.write(f"β **Memory Usage:** `{df.memory_usage(deep=True).sum() / (1024 ** 2):.2f} MB`")
        st.write(f"β **Dataset Shape:** `{df.shape}`")

        # -------------------------------
        # 2. Dataset Quality Score
        # -------------------------------
        st.markdown("### π Dataset Quality Score")
        score = compute_dataset_score(df)
        st.success(f"π― Dataset Quality Score: `{score} / 100`")

        # -------------------------------
        # 3. Column Overview with Stats
        # -------------------------------
        st.markdown("### π₯ Column Overview")
        # Numeric and categorical columns
        numeric_cols = df.select_dtypes(include=["number"]).columns
        categorical_cols = df.select_dtypes(include=["object"]).columns

        # Per-column null counts, computed once and reused below.
        missing_counts = df.isnull().sum()

        profile = pd.DataFrame({
            "Column": df.columns,
            "Data Type": df.dtypes.values,
            "Missing Values": missing_counts.values,
            # len(df) is non-zero here because the dataset is not empty.
            "Missing %": (missing_counts / len(df) * 100).values,
            "Unique Values": df.nunique().values,
        })

        # Add numeric statistics
        if len(numeric_cols) > 0:
            numeric_df = df[numeric_cols]
            numeric_stats = pd.DataFrame({
                "Column": numeric_cols,
                "Min": numeric_df.min().values,
                "Max": numeric_df.max().values,
                "Mean": numeric_df.mean().values,
                "Std Dev": numeric_df.std().values,
                "Skewness": numeric_df.skew().values,
                "Kurtosis": numeric_df.kurt().values,
            })
            # Left merge: non-numeric columns keep their row, with NaN stats.
            profile = profile.merge(numeric_stats, on="Column", how="left")
        st.dataframe(profile)

        # -------------------------------
        # 4. Missing Values Visualization
        # -------------------------------
        st.markdown("### π Missing Values Distribution")
        missing_values = missing_counts[missing_counts > 0]
        if not missing_values.empty:
            fig, ax = plt.subplots(figsize=(12, 5))
            sns.barplot(x=missing_values.index, y=missing_values.values, ax=ax, color="skyblue")
            ax.set_title("Missing Values per Column")
            ax.set_ylabel("Missing Count")
            # Rotate the existing tick labels instead of re-assigning them.
            ax.tick_params(axis="x", labelrotation=45)
            st.pyplot(fig)
            plt.close(fig)
        else:
            st.success("No missing values found!")

        # -------------------------------
        # 5. Duplicates Detection
        # -------------------------------
        st.markdown("### π₯ Duplicates & Constant Columns Detection")
        # Duplicates
        duplicate_count = df.duplicated().sum()
        st.write(f"π **Duplicate Rows:** `{duplicate_count}`")
        # Constant Columns
        constant_cols = [col for col in df.columns if df[col].nunique() == 1]
        if constant_cols:
            st.write(f"π© **Constant Columns:** `{constant_cols}`")
        else:
            st.success("No constant columns detected!")

        # -------------------------------
        # 6. Cardinality Analysis
        # -------------------------------
        st.markdown("### 𧬠Cardinality Analysis")
        # A column is "high cardinality" when more than 80% of rows are unique.
        cardinality_threshold = len(df) * 0.8
        high_cardinality = [
            col for col in df.columns if df[col].nunique() > cardinality_threshold
        ]
        if high_cardinality:
            st.write(f"π’ **High-Cardinality Columns:** `{high_cardinality}`")
        else:
            st.success("No high-cardinality columns detected!")

        # -------------------------------
        # 7. Top Frequent & Rare Values
        # -------------------------------
        st.markdown("### π― Frequent & Rare Values")
        for col in categorical_cols:
            st.write(f"β **{col}**")
            counts = df[col].value_counts()
            top_values = counts.nlargest(5)
            rare_values = counts.nsmallest(5)
            st.write("π **Top Frequent Values:**")
            st.dataframe(top_values)
            st.write("π§ͺ **Rare Values:**")
            st.dataframe(rare_values)

        # -------------------------------
        # 8. Correlation Matrix
        # -------------------------------
        st.markdown("### π Correlation Matrix")
        if len(numeric_cols) > 1:
            corr = df[numeric_cols].corr()
            fig, ax = plt.subplots(figsize=(12, 8))
            sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True, ax=ax)
            st.pyplot(fig)
            plt.close(fig)
        else:
            st.info("Not enough numeric columns for correlation analysis.")

        # -------------------------------
        # 9. Pair Plot (Numerical Relationships)
        # -------------------------------
        st.markdown("### π₯ Pair Plot (Numerical Relationships)")
        if len(numeric_cols) >= 2:
            pairplot = sns.pairplot(df[numeric_cols], diag_kind='kde')
            st.pyplot(pairplot.fig)
            plt.close(pairplot.fig)
        else:
            st.info("Not enough numeric columns for pair plot visualization.")

        # -------------------------------
        # 10. Outlier Detection
        # -------------------------------
        st.markdown("### π© Outlier Detection")
        outliers = detect_outliers(df)
        if outliers:
            st.write("β **Outliers Detected:**")
            # NOTE(review): assumes detect_outliers returns a mapping of
            # column -> outlier count; confirm against utils.
            st.dataframe(
                pd.DataFrame(list(outliers.items()), columns=["Column", "Outlier Count"])
            )
        else:
            st.success("No significant outliers detected!")

        # -------------------------------
        # 11. Inconsistent Data Types
        # -------------------------------
        st.markdown("### π« Inconsistent Data Types")
        inconsistent_types = detect_inconsistent_types(df)
        if inconsistent_types:
            st.write("β οΈ **Inconsistent Data Types Detected:**")
            st.write(inconsistent_types)
        else:
            st.success("No inconsistent data types detected!")
    else:
        st.warning("No dataset available for profiling.")
# -------------------------------
# Tab 5: Outlier Detection
# -------------------------------
# Writes the raw result of `detect_outliers` for the current dataset.
with tabs[4]:
    st.subheader("π Outlier Detection")
    if df.empty:
        st.warning("No dataset available for outlier detection.")
    else:
        outliers = detect_outliers(df)
        st.write(outliers)
# -------------------------------
# Tab 7: Export
# -------------------------------
# Offers the current dataset for download in the selected format.
# "Export" is the seventh entry of the tab list, i.e. tabs[6]; tabs[5] is the
# "Custom Transformations" tab.
with tabs[6]:
    st.subheader("π€ Export Dataset")
    export_format = st.selectbox("Export Format", ["CSV", "Excel", "JSON"])
    if not df.empty:
        # Serialize in the format the user actually picked, with a matching
        # file extension and MIME type (previously the payload was always CSV
        # and the Excel download was named "dataset.excel").
        if export_format == "Excel":
            # Excel output is binary, so it is written to an in-memory buffer.
            # Needs an Excel writer engine installed (e.g. openpyxl).
            excel_buffer = io.BytesIO()
            df.to_excel(excel_buffer, index=False)
            export_data = excel_buffer.getvalue()
            export_ext = "xlsx"
            export_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        elif export_format == "JSON":
            export_data = df.to_json(orient="records")
            export_ext = "json"
            export_mime = "application/json"
        else:
            export_data = df.to_csv(index=False)
            export_ext = "csv"
            export_mime = "text/csv"
        st.download_button(
            "Download",
            export_data,
            f"dataset.{export_ext}",
            mime=export_mime,
        )