data-clean / app.py
leadingbridge's picture
Update app.py
f440bdc verified
import gradio as gr
import pandas as pd
from datetime import datetime
import pytz
import os
def process_file(uploaded_file):
# 1. Read Excel file (first sheet automatically)
fname = uploaded_file.name.lower()
if not fname.endswith(('.xls', '.xlsx', '.xlsm')):
return "❌ Unsupported format. Please upload .xls/.xlsx/.xlsm", None
try:
df = pd.read_excel(uploaded_file.name)
except Exception as e:
return f"❌ Error reading file: {e}", None
# 2. Define the output headers in order
output_headers = [
"Usage", "District", "Address", "Longitude", "Latitude",
"Floor", "Unit", "Area", "PriceInMillion", "PricePerSquareFeet",
"InstrumentDate", "Year", "WeekNumber", "DeliveryDate", "MemoNo."
]
# 3. Prepare an empty DataFrame
output_df = pd.DataFrame(index=df.index, columns=output_headers)
# 4. Map the first 7 columns from the input to the relevant fields:
# Col 1 β†’ Address
# Col 2 β†’ Floor
# Col 3 β†’ Unit
# Col 4 β†’ Area
# Col 5 β†’ PriceInMillion
# Col 6 β†’ PricePerSquareFeet
# Col 7 β†’ InstrumentDate
output_df["Address"] = df.iloc[:, 0]
output_df["Floor"] = df.iloc[:, 1]
output_df["Unit"] = df.iloc[:, 2]
output_df["Area"] = df.iloc[:, 3]
output_df["PriceInMillion"] = df.iloc[:, 4]
output_df["PricePerSquareFeet"] = df.iloc[:, 5]
output_df["InstrumentDate"] = pd.to_datetime(df.iloc[:, 6], errors="coerce")
# 5. Derive Year and ISO Week Number from InstrumentDate
# Any invalid dates become NaT and yield NaN year/week
output_df["Year"] = output_df["InstrumentDate"].dt.year
output_df["WeekNumber"] = output_df["InstrumentDate"].dt.isocalendar().week
# 6. Leave Usage, District, Longitude, Latitude, DeliveryDate, MemoNo. empty
# (or populate them here if you have logic to do so)
# 7. Generate filename based on Hong Kong date
hk_tz = pytz.timezone("Asia/Hong_Kong")
today_hk = datetime.now(hk_tz).strftime("%y%m%d")
out_fname = f"data-clean-{today_hk}.xlsx"
# 8. Save to Excel (in the current working directory)
output_df.to_excel(out_fname, index=False)
# Return the DataFrame for preview and the path to download
return output_df, out_fname
with gr.Blocks(title="Data Cleaner") as demo:
gr.Markdown("## πŸ—‚οΈ Excel β†’ Cleaned Data Mapping")
with gr.Row():
file_input = gr.File(label="Upload .xls/.xlsx/.xlsm")
run_btn = gr.Button("Process")
with gr.Row():
df_out = gr.DataFrame(label="Mapped Data Preview")
download_btn = gr.File(label="Download Cleaned File")
run_btn.click(
fn=process_file,
inputs=[file_input],
outputs=[df_out, download_btn]
)
if __name__ == "__main__":
demo.launch()