Spaces:

leadingbridge
/

data-clean

Running

File size: 2,901 Bytes

86bed75
14d7d35
 
f440bdc
 
0a5207e
f440bdc
 
 
 
 
0a5207e
f440bdc
 
 
 
0a5207e
f440bdc
07f4651
bd7b118
f440bdc
 
bd7b118
0a5207e
f440bdc
 
0a5207e
f440bdc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a5207e
 
f440bdc
 
 
 
 
 
 
86bed75
f440bdc
 
0a5207e
f440bdc
 
0a5207e
f440bdc
 
86bed75
f440bdc
 
86bed75
f440bdc
 
 
 
 
 
 
0a5207e
f440bdc

import gradio as gr
import pandas as pd
from datetime import datetime
import pytz
import os

def process_file(uploaded_file):
    # 1. Read Excel file (first sheet automatically)
    fname = uploaded_file.name.lower()
    if not fname.endswith(('.xls', '.xlsx', '.xlsm')):
        return "❌ Unsupported format. Please upload .xls/.xlsx/.xlsm", None

    try:
        df = pd.read_excel(uploaded_file.name)
    except Exception as e:
        return f"❌ Error reading file: {e}", None

    # 2. Define the output headers in order
    output_headers = [
        "Usage", "District", "Address", "Longitude", "Latitude",
        "Floor", "Unit", "Area", "PriceInMillion", "PricePerSquareFeet",
        "InstrumentDate", "Year", "WeekNumber", "DeliveryDate", "MemoNo."
    ]

    # 3. Prepare an empty DataFrame
    output_df = pd.DataFrame(index=df.index, columns=output_headers)

    # 4. Map the first 7 columns from the input to the relevant fields:
    #    Col 1 → Address
    #    Col 2 → Floor
    #    Col 3 → Unit
    #    Col 4 → Area
    #    Col 5 → PriceInMillion
    #    Col 6 → PricePerSquareFeet
    #    Col 7 → InstrumentDate
    output_df["Address"]            = df.iloc[:, 0]
    output_df["Floor"]              = df.iloc[:, 1]
    output_df["Unit"]               = df.iloc[:, 2]
    output_df["Area"]               = df.iloc[:, 3]
    output_df["PriceInMillion"]     = df.iloc[:, 4]
    output_df["PricePerSquareFeet"] = df.iloc[:, 5]
    output_df["InstrumentDate"]     = pd.to_datetime(df.iloc[:, 6], errors="coerce")

    # 5. Derive Year and ISO Week Number from InstrumentDate
    #    Any invalid dates become NaT and yield NaN year/week
    output_df["Year"] = output_df["InstrumentDate"].dt.year
    output_df["WeekNumber"] = output_df["InstrumentDate"].dt.isocalendar().week

    # 6. Leave Usage, District, Longitude, Latitude, DeliveryDate, MemoNo. empty
    #    (or populate them here if you have logic to do so)

    # 7. Generate filename based on Hong Kong date
    hk_tz = pytz.timezone("Asia/Hong_Kong")
    today_hk = datetime.now(hk_tz).strftime("%y%m%d")
    out_fname = f"data-clean-{today_hk}.xlsx"

    # 8. Save to Excel (in the current working directory)
    output_df.to_excel(out_fname, index=False)

    # Return the DataFrame for preview and the path to download
    return output_df, out_fname

with gr.Blocks(title="Data Cleaner") as demo:
    gr.Markdown("## 🗂️ Excel → Cleaned Data Mapping")
    with gr.Row():
        file_input = gr.File(label="Upload .xls/.xlsx/.xlsm")
        run_btn    = gr.Button("Process")
    with gr.Row():
        df_out       = gr.DataFrame(label="Mapped Data Preview")
        download_btn = gr.File(label="Download Cleaned File")
    run_btn.click(
        fn=process_file,
        inputs=[file_input],
        outputs=[df_out, download_btn]
    )

if __name__ == "__main__":
    demo.launch()