Spaces:
Sleeping
Sleeping
import gradio as gr | |
import pandas as pd | |
from datetime import datetime | |
def process_file(file): | |
# 1. Validate extension | |
name = file.name.lower() | |
if not name.endswith(('.xls', '.xlsx', '.xlsm')): | |
return "Error: Please upload .xls/.xlsx/.xlsm file.", None | |
# 2. Read without header, drop blank columns, and skip header rows if needed | |
df = pd.read_excel(file.name, header=None) # read raw data :contentReference[oaicite:7]{index=7} | |
df = df.dropna(axis=1, how="all") # drop fully empty cols :contentReference[oaicite:8]{index=8} | |
# If your file has descriptive top rows, you can also do: | |
# df = pd.read_excel(file.name, header=None, skiprows=2) # adjust as needed :contentReference[oaicite:9]{index=9} | |
# 3. Define output schema | |
headers = [ | |
"Usage", "District", "Address", "Longitude", "Latitude", | |
"Floor", "Unit", "Area", "PriceInMillion", | |
"InstrumentDate", "Year", "WeekNumber", | |
"DeliveryDate", "MemoNo." | |
] | |
output_df = pd.DataFrame("", index=range(len(df)), columns=headers) | |
# 4. Map positional columns via iloc | |
output_df["Usage"] = df.iloc[:, 0] # Column1 → Usage :contentReference[oaicite:10]{index=10} | |
output_df["Floor"] = df.iloc[:, 1] # Column2 → Floor :contentReference[oaicite:11]{index=11} | |
output_df["Unit"] = df.iloc[:, 2] # Column3 → Unit :contentReference[oaicite:12]{index=12} | |
# 5. Clean and map Area (extract number from "507 ft2") | |
output_df["Area"] = ( | |
df.iloc[:, 3] | |
.astype(str) | |
.str.extract(r"(\d+\.?\d*)", expand=False) # extract numeric :contentReference[oaicite:13]{index=13} | |
.astype(float) | |
) | |
# 6. Map and clean PriceInMillion (remove non‐digits then convert) | |
output_df["PriceInMillion"] = pd.to_numeric( | |
df.iloc[:, 4].replace(r"[^0-9\.]", "", regex=True), | |
errors="coerce" | |
) # robust numeric conversion :contentReference[oaicite:14]{index=14} | |
# 7. Map PricePerSquareFeet into "District" if that’s your intended slot | |
output_df["District"] = df.iloc[:, 5] # Column6 → District :contentReference[oaicite:15]{index=15} | |
# 8. Parse InstrumentDate and derive Year & WeekNumber | |
output_df["InstrumentDate"] = pd.to_datetime( | |
df.iloc[:, 6], errors="coerce" | |
) # robust parsing :contentReference[oaicite:16]{index=16} | |
output_df["Year"] = output_df["InstrumentDate"].dt.year | |
output_df["WeekNumber"] = output_df["InstrumentDate"].dt.isocalendar().week | |
# 9. (Optional) Leave DeliveryDate & MemoNo. blank or map if available | |
# output_df["DeliveryDate"] = ... | |
# output_df["MemoNo."] = ... | |
# 10. Save output with data‑clean‑YYMMDD filename | |
suffix = datetime.now().strftime("%y%m%d") | |
out_name = f"data-clean-{suffix}.xlsx" | |
output_df.to_excel(out_name, index=False) | |
return output_df, out_name | |
with gr.Blocks() as demo: | |
gr.Markdown("## Excel → data‑clean Mapping Tool") | |
with gr.Row(): | |
inp = gr.File(label="Upload .xls/.xlsx/.xlsm") | |
btn = gr.Button("Process") | |
with gr.Row(): | |
df_view = gr.Dataframe(label="Mapped Data") | |
df_download = gr.File(label="Download Output") | |
btn.click(process_file, inputs=[inp], outputs=[df_view, df_download]) | |
demo.launch() | |