File size: 4,027 Bytes
95be363
 
1c11299
95be363
1c11299
95be363
1c11299
 
 
 
 
 
95be363
1c11299
 
 
 
 
 
 
 
95be363
1c11299
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95be363
 
1c11299
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95be363
 
1c11299
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import streamlit as st
import pandas as pd
import plotly.express as px
from ydata_profiling import ProfileReport
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 1. Page configuration: browser-tab title and icon, full-width layout
st.set_page_config(page_title="Enhanced Data Profiling", page_icon="📊", layout="wide")

# 2. Custom CSS for a clean, white UI
# The stylesheet below is injected into the page as raw HTML, which is why the
# markdown call has to opt in with `unsafe_allow_html=True`.
# NOTE(review): `.css-1d391kg` and `.css-hxt7ib` look like auto-generated class
# names — confirm they still match the Streamlit version this app runs on.
custom_css = """
<style>
/* Make the entire background white */
body {
    background-color: #ffffff !important;
    font-family: 'Roboto', sans-serif;
}

/* Headers and titles */
h1, h2, h3, h4 {
    color: #2c3e50;
    font-weight: 700;
}

/* The main Streamlit container */
[data-testid="stAppViewContainer"] {
    background-color: #ffffff !important;
}

/* Individual content containers */
.css-1d391kg, .css-hxt7ib {
    background-color: #ffffff !important;
    border-radius: 15px;
    padding: 30px;
    margin-bottom: 20px;
    box-shadow: 0 8px 16px rgba(0,0,0,0.1);
}

/* Sidebar styling */
[data-testid="stSidebar"] {
    background-color: #34495e !important;
    color: #ecf0f1 !important;
    font-size: 16px;
}
[data-testid="stSidebar"] .css-1d391kg {
    background-color: #2c3e50 !important;
    border-radius: 10px;
}
</style>
"""
st.markdown(
    custom_css,
    unsafe_allow_html=True,
)

# 3. Page heading plus a centered tagline rendered as raw HTML
st.title("Enhanced Data Profiling")
tagline_html = "<h4 style='text-align: center; color: #2c3e50;'>Upload your CSV and explore it thoroughly!</h4>"
st.markdown(tagline_html, unsafe_allow_html=True)

# 4. Sidebar: section header and the CSV upload widget
with st.sidebar:
    st.header("Upload & Options")
    uploaded_file = st.file_uploader("Upload a CSV file", type="csv")

# The DataFrame stays None until an uploaded file has been read
df = None

if uploaded_file is not None:
    # 4a. Read the CSV. An empty, malformed or wrongly-encoded upload is
    # reported in the UI and the run is halted, instead of surfacing a raw
    # traceback to the user.
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        st.stop()
    st.success("File uploaded successfully!")

    # 5. KPI Metrics / Quick Summary
    st.subheader("Dataset Quick Summary")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Rows", f"{df.shape[0]}")
    col2.metric("Columns", f"{df.shape[1]}")
    # Guard the ratio: a header-only CSV has zero cells, and dividing by zero
    # would display "nan%".
    total_cells = df.size
    missing_cells = df.isnull().sum().sum()
    missing_percentage = (missing_cells / total_cells) * 100 if total_cells else 0.0
    col3.metric("Missing %", f"{missing_percentage:.2f}%")
    duplicates = df.duplicated().sum()
    col4.metric("Duplicates", f"{duplicates}")

    st.write("---")

    # 6. Optional Data Transformation: Drop columns with > 50% missing
    if st.checkbox("Drop columns with > 50% missing data?"):
        threshold = df.shape[0] * 0.5
        before_cols = df.shape[1]
        # `<=` keeps a column whose missing count is exactly at the threshold:
        # only columns with strictly more than 50% missing are dropped, and a
        # frame with zero rows keeps all of its columns.
        df = df.loc[:, df.isnull().sum() <= threshold]
        after_cols = df.shape[1]
        st.success(f"Dropped {before_cols - after_cols} columns. Remaining columns: {after_cols}")

    # 7. Optional Quick Histogram (only offered when numeric columns exist)
    numeric_cols = df.select_dtypes(include="number").columns.tolist()
    if numeric_cols:
        st.subheader("Optional Quick Histogram")
        selected_col = st.selectbox("Select a numeric column", numeric_cols)
        if selected_col:
            fig_hist = px.histogram(df, x=selected_col, nbins=50, title=f"Histogram of {selected_col}")
            fig_hist.update_traces(opacity=0.8)
            st.plotly_chart(fig_hist, use_container_width=True)

    # 8. Generate ydata-profiling Report
    st.subheader("Comprehensive Profiling Report")
    if df.empty:
        # No rows or no columns left: there is nothing to profile, so skip
        # report generation rather than hand the profiler an empty frame.
        st.warning("The dataset has no rows or no columns, so there is nothing to profile.")
    else:
        with st.spinner("Generating profiling report..."):
            profile = ProfileReport(df, title="Profiling Report", explorative=True)
            report_html = profile.to_html()

        # 8a. Display the report in an iframe
        st.components.v1.html(report_html, height=1200, scrolling=True)

        # 8b. Download Button for HTML
        st.write("### Download the Profiling Report")
        st.download_button(
            label="Download HTML",
            data=report_html.encode('utf-8'),
            file_name="profiling_report.html",
            mime="text/html"
        )
else:
    st.info("Awaiting CSV file upload.")

# That's it! 
# Simply copy and paste this into your app.py on Hugging Face Spaces.
# Make sure you have a requirements.txt that includes:
#   streamlit
#   pandas
#   ydata-profiling
#   plotly
#   statsmodels (required: variance_inflation_factor is imported at the top of this file)