Spaces:

leadingbridge
/

data-clean

Sleeping

App Files Community

leadingbridge committed 11 days ago

Commit

f440bdc

verified ·

1 Parent(s): 07f4651

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -51

app.py CHANGED Viewed

@@ -1,70 +1,78 @@
 import gradio as gr
 import pandas as pd
 from datetime import datetime
-def process_file(file):
-    # 1. Validate extension
-    name = file.name.lower()
-    if not name.endswith(('.xls', '.xlsx', '.xlsm')):
-        return "Error: Please upload a .xls, .xlsx or .xlsm file.", None
-    # 2. Read sheet 0 without header rows (so data starts at row 0)
-    df = pd.read_excel(file.name, header=None)
-    df = df.dropna(axis=1, how="all")  # drop any fully empty columns
-    # 3. Prepare output schema
     output_headers = [
         "Usage", "District", "Address", "Longitude", "Latitude",
-        "Floor", "Unit", "Area", "PriceInMillion",
-        "PricePerSquareFeet", "InstrumentDate", "Year",
-        "WeekNumber", "DeliveryDate", "MemoNo."
     ]
-    output_df = pd.DataFrame("", index=range(len(df)), columns=output_headers)
-    # 4. Map exactly as requested:
-    output_df["Address"]             = df.iloc[:, 0]  # col 1 → Address
-    output_df["Floor"]               = df.iloc[:, 1]  # col 2 → Floor
-    output_df["Unit"]                = df.iloc[:, 2]  # col 3 → Unit
-    output_df["Area"]                = df.iloc[:, 3]  # col 4 → Area
-    output_df["PriceInMillion"]      = pd.to_numeric(
-                                          df.iloc[:, 4]
-                                            .replace(r"[^0-9\.]", "", regex=True),
-                                          errors="coerce"
-                                      )               # col 5 → PriceInMillion
-    output_df["PricePerSquareFeet"]  = pd.to_numeric(
-                                          df.iloc[:, 5]
-                                            .replace(r"[^0-9\.]", "", regex=True),
-                                          errors="coerce"
-                                      )               # col 6 → PricePerSquareFeet
-    output_df["InstrumentDate"]      = pd.to_datetime(
-                                          df.iloc[:, 6],
-                                          errors="coerce"
-                                      )               # col 7 → InstrumentDate
-    # 5. Derive Year & WeekNumber
-    output_df["Year"]       = output_df["InstrumentDate"].dt.year
     output_df["WeekNumber"] = output_df["InstrumentDate"].dt.isocalendar().week
-    # 6. (Leave DeliveryDate & MemoNo. blank unless you have source columns)
-    # output_df["DeliveryDate"] = ...
-    # output_df["MemoNo."]      = ...
-    # 7. Generate output filename
-    suffix   = datetime.now().strftime("%y%m%d")
-    out_name = f"data-clean-{suffix}.xlsx"
-    # 8. Save and return
-    output_df.to_excel(out_name, index=False)
-    return output_df, out_name
-with gr.Blocks(title="Excel → data‑clean Mapper") as demo:
-    gr.Markdown("## Upload your .xls/.xlsx/.xlsm file for data‑clean mapping")
     with gr.Row():
-        file_in = gr.File(label="Input File")
-        btn     = gr.Button("Process")
     with gr.Row():
-        df_out    = gr.Dataframe(label="Mapped Data")
-        download  = gr.File(label="Download Output")
-    btn.click(process_file, inputs=[file_in], outputs=[df_out, download])
-demo.launch()

 import gradio as gr
 import pandas as pd
 from datetime import datetime
+import pytz
+import os
+def _error_preview(message):
+    """Wrap an error message in a one-cell DataFrame for the preview output."""
+    return pd.DataFrame({"Error": [message]})
+
+
+def process_file(uploaded_file):
+    """Map the first seven columns of an uploaded Excel sheet onto the output schema.
+
+    Args:
+        uploaded_file: Upload object exposing the file path as ``.name``, or
+            ``None`` when nothing was uploaded.
+
+    Returns:
+        A ``(preview, path)`` tuple. On success ``preview`` is the mapped
+        DataFrame and ``path`` is the name of the ``.xlsx`` file written to
+        the current working directory. On failure ``preview`` is a one-cell
+        DataFrame holding the error message and ``path`` is ``None``.
+    """
+    # Failures are reported as a one-cell table rather than a bare string.
+    # NOTE(review): Gradio's Dataframe component is documented to treat a
+    # str value as a CSV file path, so a plain message would not be shown —
+    # confirm against the installed Gradio version.
+    # 1. Validate the upload, then read the Excel file (first sheet automatically)
+    if uploaded_file is None:
+        return _error_preview("❌ No file uploaded. Please upload .xls/.xlsx/.xlsm"), None
+    fname = uploaded_file.name.lower()
+    if not fname.endswith(('.xls', '.xlsx', '.xlsm')):
+        return _error_preview("❌ Unsupported format. Please upload .xls/.xlsx/.xlsm"), None
+    try:
+        df = pd.read_excel(uploaded_file.name)
+    except Exception as e:
+        # Boundary handler: any read failure is surfaced to the user as text.
+        return _error_preview(f"❌ Error reading file: {e}"), None
+    # The mapping below indexes columns 0-6 by position; with fewer columns
+    # df.iloc[:, 6] would raise IndexError.
+    if df.shape[1] < 7:
+        return _error_preview(
+            f"❌ Expected at least 7 columns, found {df.shape[1]}"
+        ), None
+    # 2. Define the output headers in order
     output_headers = [
         "Usage", "District", "Address", "Longitude", "Latitude",
+        "Floor", "Unit", "Area", "PriceInMillion", "PricePerSquareFeet",
+        "InstrumentDate", "Year", "WeekNumber", "DeliveryDate", "MemoNo."
     ]
+    # 3. Prepare an empty DataFrame sharing the input's index so the column
+    #    assignments below align row for row
+    output_df = pd.DataFrame(index=df.index, columns=output_headers)
+    # 4. Map the first 7 columns from the input to the relevant fields:
+    #    Col 1 → Address
+    #    Col 2 → Floor
+    #    Col 3 → Unit
+    #    Col 4 → Area
+    #    Col 5 → PriceInMillion
+    #    Col 6 → PricePerSquareFeet
+    #    Col 7 → InstrumentDate
+    output_df["Address"]            = df.iloc[:, 0]
+    output_df["Floor"]              = df.iloc[:, 1]
+    output_df["Unit"]               = df.iloc[:, 2]
+    output_df["Area"]               = df.iloc[:, 3]
+    output_df["PriceInMillion"]     = df.iloc[:, 4]
+    output_df["PricePerSquareFeet"] = df.iloc[:, 5]
+    output_df["InstrumentDate"]     = pd.to_datetime(df.iloc[:, 6], errors="coerce")
+    # 5. Derive Year and ISO Week Number from InstrumentDate
+    #    Any invalid dates become NaT and yield missing year/week values
+    output_df["Year"] = output_df["InstrumentDate"].dt.year
     output_df["WeekNumber"] = output_df["InstrumentDate"].dt.isocalendar().week
+    # 6. Leave Usage, District, Longitude, Latitude, DeliveryDate, MemoNo. empty
+    #    (or populate them here if you have logic to do so)
+    # 7. Generate filename based on Hong Kong date
+    hk_tz = pytz.timezone("Asia/Hong_Kong")
+    today_hk = datetime.now(hk_tz).strftime("%y%m%d")
+    out_fname = f"data-clean-{today_hk}.xlsx"
+    # 8. Save to Excel (in the current working directory)
+    output_df.to_excel(out_fname, index=False)
+    # Return the DataFrame for preview and the path to download
+    return output_df, out_fname
+# Two-row layout: upload + trigger on the first row, preview + download on
+# the second.
+with gr.Blocks(title="Data Cleaner") as demo:
+    gr.Markdown("## 🗂️ Excel → Cleaned Data Mapping")
     with gr.Row():
+        file_input = gr.File(label="Upload .xls/.xlsx/.xlsm")
+        run_btn = gr.Button("Process")
     with gr.Row():
+        df_out = gr.DataFrame(label="Mapped Data Preview")
+        download_btn = gr.File(label="Download Cleaned File")
+    # process_file returns a pair; its order must match the `outputs` list.
+    run_btn.click(process_file, inputs=[file_input], outputs=[df_out, download_btn])
+# Launch the app only when executed as a script, not when imported.
+if __name__ == "__main__":
+    demo.launch()