File size: 4,027 Bytes
95be363 1c11299 95be363 1c11299 95be363 1c11299 95be363 1c11299 95be363 1c11299 95be363 1c11299 95be363 1c11299 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import streamlit as st
import pandas as pd
import plotly.express as px
from ydata_profiling import ProfileReport
from statsmodels.stats.outliers_influence import variance_inflation_factor
# 1. Configure the page: browser-tab title, tab icon, and full-width layout.
page_settings = {
    "page_title": "Enhanced Data Profiling",
    "page_icon": "📊",
    "layout": "wide",
}
st.set_page_config(**page_settings)
# 2. Custom CSS for a Clean, White UI
# The stylesheet below is injected verbatim into the page through st.markdown
# with unsafe_allow_html=True. It sets a white page/app background, dark
# headings, card-like content containers (rounded corners + drop shadow), and
# a dark sidebar with light text.
# NOTE(review): `.css-1d391kg` and `.css-hxt7ib` look like auto-generated
# class names — confirm they still match the Streamlit version this app is
# deployed with, otherwise those rules silently stop applying.
# NOTE(review): the 'Roboto' font is referenced but no font import is visible
# in this file — presumably the browser falls back to sans-serif; verify.
custom_css = """
<style>
/* Make the entire background white */
body {
background-color: #ffffff !important;
font-family: 'Roboto', sans-serif;
}
/* Headers and titles */
h1, h2, h3, h4 {
color: #2c3e50;
font-weight: 700;
}
/* The main Streamlit container */
[data-testid="stAppViewContainer"] {
background-color: #ffffff !important;
}
/* Individual content containers */
.css-1d391kg, .css-hxt7ib {
background-color: #ffffff !important;
border-radius: 15px;
padding: 30px;
margin-bottom: 20px;
box-shadow: 0 8px 16px rgba(0,0,0,0.1);
}
/* Sidebar styling */
[data-testid="stSidebar"] {
background-color: #34495e !important;
color: #ecf0f1 !important;
font-size: 16px;
}
[data-testid="stSidebar"] .css-1d391kg {
background-color: #2c3e50 !important;
border-radius: 10px;
}
</style>
"""
# Inject the stylesheet; unsafe_allow_html is needed so the <style> tag is
# emitted as HTML rather than escaped text.
st.markdown(custom_css, unsafe_allow_html=True)
# 3. Page heading, followed by a centered tagline rendered as raw HTML.
st.title("Enhanced Data Profiling")
tagline_html = "<h4 style='text-align: center; color: #2c3e50;'>Upload your CSV and explore it thoroughly!</h4>"
st.markdown(tagline_html, unsafe_allow_html=True)

# 4. Sidebar: section header and the CSV upload widget.
# `uploaded_file` is None until the user picks a file.
sidebar = st.sidebar
sidebar.header("Upload & Options")
uploaded_file = sidebar.file_uploader("Upload a CSV file", type="csv")
# Placeholder for the DataFrame; stays None until a file has been uploaded.
df = None

if uploaded_file is not None:
    # 4a. Read the CSV. An empty, malformed, or non-UTF-8 upload is reported
    # to the user and the script run is halted, instead of surfacing a raw
    # traceback in the app.
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        st.stop()
    st.success("File uploaded successfully!")

    # 5. KPI Metrics / Quick Summary (computed on the data as uploaded, before
    # any optional column dropping below).
    st.subheader("Dataset Quick Summary")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Rows", f"{df.shape[0]}")
    col2.metric("Columns", f"{df.shape[1]}")
    # Guard the ratio: a header-only CSV yields df.size == 0, which would
    # otherwise divide by zero and display "nan%".
    total_cells = df.size
    missing_cells = df.isnull().sum().sum()
    missing_percentage = (missing_cells / total_cells) * 100 if total_cells else 0.0
    col3.metric("Missing %", f"{missing_percentage:.2f}%")
    duplicates = df.duplicated().sum()
    col4.metric("Duplicates", f"{duplicates}")
    st.write("---")

    # 6. Optional Data Transformation: Drop columns with > 50% missing
    if st.checkbox("Drop columns with > 50% missing data?"):
        threshold = df.shape[0] * 0.5
        before_cols = df.shape[1]
        # Keep columns whose missing count is at most half the rows. Using
        # `<=` (not `<`) matches the "> 50%" wording: a column that is exactly
        # 50% missing is kept.
        df = df.loc[:, df.isnull().sum() <= threshold]
        after_cols = df.shape[1]
        st.success(f"Dropped {before_cols - after_cols} columns. Remaining columns: {after_cols}")

    # 7. Optional Quick Histogram (only offered when numeric columns exist).
    numeric_cols = df.select_dtypes(include="number").columns.tolist()
    if numeric_cols:
        st.subheader("Optional Quick Histogram")
        selected_col = st.selectbox("Select a numeric column", numeric_cols)
        if selected_col:
            fig_hist = px.histogram(df, x=selected_col, nbins=50, title=f"Histogram of {selected_col}")
            fig_hist.update_traces(opacity=0.8)
            st.plotly_chart(fig_hist, use_container_width=True)

    # 8. Generate ydata-profiling Report
    st.subheader("Comprehensive Profiling Report")
    if df.empty:
        # No rows, or no columns left after the optional drop: there is
        # nothing to profile, so skip report generation entirely.
        st.warning("The dataset has no rows or no columns left to profile.")
    else:
        with st.spinner("Generating profiling report..."):
            profile = ProfileReport(df, title="Profiling Report", explorative=True)
            report_html = profile.to_html()

        # 8a. Display the report in an iframe
        st.components.v1.html(report_html, height=1200, scrolling=True)

        # 8b. Download Button for HTML
        st.write("### Download the Profiling Report")
        st.download_button(
            label="Download HTML",
            data=report_html.encode('utf-8'),
            file_name="profiling_report.html",
            mime="text/html"
        )
else:
    st.info("Awaiting CSV file upload.")
# That's it!
# Simply copy and paste this into your app.py on Hugging Face Spaces.
# Make sure you have a requirements.txt that includes:
# streamlit
# pandas
# ydata-profiling
# plotly
# statsmodels (required: variance_inflation_factor is imported from it at the top of this file)
|