leadingbridge committed on
Commit
f440bdc
·
verified ·
1 Parent(s): 07f4651

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -51
app.py CHANGED
@@ -1,70 +1,78 @@
1
  import gradio as gr
2
  import pandas as pd
3
  from datetime import datetime
 
 
4
 
5
- def process_file(file):
6
- # 1. Validate extension
7
- name = file.name.lower()
8
- if not name.endswith(('.xls', '.xlsx', '.xlsm')):
9
- return "Error: Please upload a .xls, .xlsx or .xlsm file.", None
10
 
11
- # 2. Read sheet 0 without header rows (so data starts at row 0)
12
- df = pd.read_excel(file.name, header=None)
13
- df = df.dropna(axis=1, how="all") # drop any fully empty columns
 
14
 
15
- # 3. Prepare output schema
16
  output_headers = [
17
  "Usage", "District", "Address", "Longitude", "Latitude",
18
- "Floor", "Unit", "Area", "PriceInMillion",
19
- "PricePerSquareFeet", "InstrumentDate", "Year",
20
- "WeekNumber", "DeliveryDate", "MemoNo."
21
  ]
22
- output_df = pd.DataFrame("", index=range(len(df)), columns=output_headers)
23
 
24
- # 4. Map exactly as requested:
25
- output_df["Address"] = df.iloc[:, 0] # col 1 → Address
26
- output_df["Floor"] = df.iloc[:, 1] # col 2 → Floor
27
- output_df["Unit"] = df.iloc[:, 2] # col 3 → Unit
28
- output_df["Area"] = df.iloc[:, 3] # col 4 → Area
29
- output_df["PriceInMillion"] = pd.to_numeric(
30
- df.iloc[:, 4]
31
- .replace(r"[^0-9\.]", "", regex=True),
32
- errors="coerce"
33
- ) # col 5 → PriceInMillion
34
- output_df["PricePerSquareFeet"] = pd.to_numeric(
35
- df.iloc[:, 5]
36
- .replace(r"[^0-9\.]", "", regex=True),
37
- errors="coerce"
38
- ) # col 6 → PricePerSquareFeet
39
- output_df["InstrumentDate"] = pd.to_datetime(
40
- df.iloc[:, 6],
41
- errors="coerce"
42
- ) # col 7 → InstrumentDate
43
 
44
- # 5. Derive Year & WeekNumber
45
- output_df["Year"] = output_df["InstrumentDate"].dt.year
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  output_df["WeekNumber"] = output_df["InstrumentDate"].dt.isocalendar().week
47
 
48
- # 6. (Leave DeliveryDate & MemoNo. blank unless you have source columns)
49
- # output_df["DeliveryDate"] = ...
50
- # output_df["MemoNo."] = ...
 
 
 
 
51
 
52
- # 7. Generate output filename
53
- suffix = datetime.now().strftime("%y%m%d")
54
- out_name = f"data-clean-{suffix}.xlsx"
55
 
56
- # 8. Save and return
57
- output_df.to_excel(out_name, index=False)
58
- return output_df, out_name
59
 
60
- with gr.Blocks(title="Excel → data‑clean Mapper") as demo:
61
- gr.Markdown("## Upload your .xls/.xlsx/.xlsm file for data‑clean mapping")
62
  with gr.Row():
63
- file_in = gr.File(label="Input File")
64
- btn = gr.Button("Process")
65
  with gr.Row():
66
- df_out = gr.Dataframe(label="Mapped Data")
67
- download = gr.File(label="Download Output")
68
- btn.click(process_file, inputs=[file_in], outputs=[df_out, download])
 
 
 
 
69
 
70
- demo.launch()
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  from datetime import datetime
4
+ import pytz
5
+ import os
6
 
7
+ def process_file(uploaded_file):
8
+ # 1. Read Excel file (first sheet automatically)
9
+ fname = uploaded_file.name.lower()
10
+ if not fname.endswith(('.xls', '.xlsx', '.xlsm')):
11
+ return " Unsupported format. Please upload .xls/.xlsx/.xlsm", None
12
 
13
+ try:
14
+ df = pd.read_excel(uploaded_file.name)
15
+ except Exception as e:
16
+ return f"❌ Error reading file: {e}", None
17
 
18
+ # 2. Define the output headers in order
19
  output_headers = [
20
  "Usage", "District", "Address", "Longitude", "Latitude",
21
+ "Floor", "Unit", "Area", "PriceInMillion", "PricePerSquareFeet",
22
+ "InstrumentDate", "Year", "WeekNumber", "DeliveryDate", "MemoNo."
 
23
  ]
 
24
 
25
+ # 3. Prepare an empty DataFrame
26
+ output_df = pd.DataFrame(index=df.index, columns=output_headers)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
+ # 4. Map the first 7 columns from the input to the relevant fields:
29
+ # Col 1 → Address
30
+ # Col 2 → Floor
31
+ # Col 3 → Unit
32
+ # Col 4 → Area
33
+ # Col 5 → PriceInMillion
34
+ # Col 6 → PricePerSquareFeet
35
+ # Col 7 → InstrumentDate
36
+ output_df["Address"] = df.iloc[:, 0]
37
+ output_df["Floor"] = df.iloc[:, 1]
38
+ output_df["Unit"] = df.iloc[:, 2]
39
+ output_df["Area"] = df.iloc[:, 3]
40
+ output_df["PriceInMillion"] = df.iloc[:, 4]
41
+ output_df["PricePerSquareFeet"] = df.iloc[:, 5]
42
+ output_df["InstrumentDate"] = pd.to_datetime(df.iloc[:, 6], errors="coerce")
43
+
44
+ # 5. Derive Year and ISO Week Number from InstrumentDate
45
+ # Any invalid dates become NaT and yield NaN year/week
46
+ output_df["Year"] = output_df["InstrumentDate"].dt.year
47
  output_df["WeekNumber"] = output_df["InstrumentDate"].dt.isocalendar().week
48
 
49
+ # 6. Leave Usage, District, Longitude, Latitude, DeliveryDate, MemoNo. empty
50
+ # (or populate them here if you have logic to do so)
51
+
52
+ # 7. Generate filename based on Hong Kong date
53
+ hk_tz = pytz.timezone("Asia/Hong_Kong")
54
+ today_hk = datetime.now(hk_tz).strftime("%y%m%d")
55
+ out_fname = f"data-clean-{today_hk}.xlsx"
56
 
57
+ # 8. Save to Excel (in the current working directory)
58
+ output_df.to_excel(out_fname, index=False)
 
59
 
60
+ # Return the DataFrame for preview and the path to download
61
+ return output_df, out_fname
 
62
 
63
+ with gr.Blocks(title="Data Cleaner") as demo:
64
+ gr.Markdown("## 🗂️ Excel Cleaned Data Mapping")
65
  with gr.Row():
66
+ file_input = gr.File(label="Upload .xls/.xlsx/.xlsm")
67
+ run_btn = gr.Button("Process")
68
  with gr.Row():
69
+ df_out = gr.DataFrame(label="Mapped Data Preview")
70
+ download_btn = gr.File(label="Download Cleaned File")
71
+ run_btn.click(
72
+ fn=process_file,
73
+ inputs=[file_input],
74
+ outputs=[df_out, download_btn]
75
+ )
76
 
77
+ if __name__ == "__main__":
78
+ demo.launch()