File size: 11,247 Bytes
300be5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import os
import time
import requests
from geopy.geocoders import Nominatim

# Seed BOTH random number generators for reproducibility: this module draws
# from NumPy's global RNG (weights, volumes, priorities, ...) and from the
# stdlib `random` module (addresses, names, dates, time windows, notes).
np.random.seed(42)
random.seed(42)

def generate_delivery_data(n_deliveries: int = 50, use_geocoding: bool = False) -> pd.DataFrame:
    """
    Generate synthetic delivery data with realistic Singapore addresses.

    Each delivery gets a random neighborhood/street/block/unit, a 6-digit
    postal code starting with the neighborhood's 2-digit prefix, coordinates,
    a delivery date 1-7 days from today, a time window, package details,
    a priority, a vehicle type, a status and an optional note.

    Side effect: the resulting table is also written to
    ``<three directories above this file>/data/delivery-data/delivery_data.csv``
    (the directory is created if missing).

    Args:
        n_deliveries: Number of delivery rows to generate (0 yields an empty table).
        use_geocoding: When True, look each address up through geopy's
            Nominatim geocoder (roughly one second or more per address because
            of the rate-limit delay). When False, or when a lookup fails,
            coordinates are a random point inside the neighborhood's bounding box.

    Returns:
        A pandas DataFrame with one row per delivery.

    Raises:
        ValueError: If ``n_deliveries`` is negative.
    """
    if n_deliveries < 0:
        raise ValueError(f"n_deliveries must be non-negative, got {n_deliveries}")

    # Define real Singapore neighborhoods and their actual streets
    # Format: [neighborhood_name, [list_of_real_streets], postal_code_prefix]
    sg_neighborhoods = [
        ['Ang Mo Kio', ['Ang Mo Kio Avenue 1', 'Ang Mo Kio Avenue 3', 'Ang Mo Kio Avenue 4', 'Ang Mo Kio Avenue 10'], '56'],
        ['Bedok', ['Bedok North Avenue 1', 'Bedok North Road', 'Bedok Reservoir Road', 'New Upper Changi Road'], '46'],
        ['Bishan', ['Bishan Street 11', 'Bishan Street 12', 'Bishan Street 13', 'Bishan Street 22'], '57'],
        ['Bukit Merah', ['Jalan Bukit Merah', 'Henderson Road', 'Tiong Bahru Road', 'Redhill Close'], '15'],
        ['Bukit Batok', ['Bukit Batok East Avenue 6', 'Bukit Batok West Avenue 8', 'Bukit Batok Street 21'], '65'],
        ['Clementi', ['Clementi Avenue 1', 'Clementi Avenue 4', 'Clementi Road', 'Commonwealth Avenue West'], '12'],
        ['Geylang', ['Geylang East Avenue 1', 'Geylang Road', 'Guillemard Road', 'Sims Avenue'], '38'],
        ['Hougang', ['Hougang Avenue 1', 'Hougang Avenue 7', 'Hougang Street 91', 'Upper Serangoon Road'], '53'],
        ['Jurong East', ['Jurong East Street 13', 'Jurong East Avenue 1', 'Jurong Gateway Road'], '60'],
        ['Jurong West', ['Jurong West Street 41', 'Jurong West Street 52', 'Jurong West Street 93'], '64'],
        ['Kallang', ['Kallang Avenue', 'Geylang Bahru', 'Boon Keng Road', 'Upper Boon Keng Road'], '33'],
        ['Punggol', ['Punggol Central', 'Punggol Field', 'Punggol Road', 'Punggol Way'], '82'],
        ['Queenstown', ['Commonwealth Avenue', 'Commonwealth Drive', 'Mei Chin Road', 'Stirling Road'], '14'],
        ['Sengkang', ['Sengkang East Way', 'Sengkang West Way', 'Compassvale Road', 'Fernvale Road'], '54'],
        ['Serangoon', ['Serangoon Avenue 2', 'Serangoon Avenue 3', 'Serangoon North Avenue 1'], '55'],
        ['Tampines', ['Tampines Street 11', 'Tampines Street 21', 'Tampines Avenue 1', 'Tampines Avenue 4'], '52'],
        ['Toa Payoh', ['Toa Payoh Lorong 1', 'Toa Payoh Lorong 2', 'Toa Payoh Lorong 4', 'Toa Payoh Central'], '31'],
        ['Woodlands', ['Woodlands Avenue 1', 'Woodlands Drive 16', 'Woodlands Drive 72', 'Woodlands Circle'], '73'],
        ['Yishun', ['Yishun Avenue 1', 'Yishun Avenue 4', 'Yishun Ring Road', 'Yishun Street 22'], '76']
    ]

    # Bounding boxes for neighborhoods (for fallback coordinates)
    # Format: name -> [min_lat, max_lat, min_lon, max_lon]
    neighborhood_bounds = {
        'Ang Mo Kio': [1.360000, 1.380000, 103.830000, 103.860000],
        'Bedok': [1.320000, 1.335000, 103.920000, 103.950000],
        'Bishan': [1.345000, 1.360000, 103.830000, 103.855000],
        'Bukit Merah': [1.270000, 1.290000, 103.800000, 103.830000],
        'Bukit Batok': [1.340000, 1.360000, 103.740000, 103.770000],
        'Clementi': [1.310000, 1.325000, 103.750000, 103.780000],
        'Geylang': [1.310000, 1.325000, 103.880000, 103.900000],
        'Hougang': [1.370000, 1.385000, 103.880000, 103.900000],
        'Jurong East': [1.330000, 1.345000, 103.730000, 103.750000],
        'Jurong West': [1.340000, 1.360000, 103.690000, 103.720000],
        'Kallang': [1.300000, 1.320000, 103.850000, 103.880000],
        'Punggol': [1.390000, 1.410000, 103.900000, 103.920000],
        'Queenstown': [1.290000, 1.310000, 103.780000, 103.805000],
        'Sengkang': [1.380000, 1.395000, 103.870000, 103.900000],
        'Serangoon': [1.345000, 1.360000, 103.865000, 103.885000],
        'Tampines': [1.345000, 1.365000, 103.930000, 103.960000],
        'Toa Payoh': [1.326000, 1.341000, 103.840000, 103.865000],
        'Woodlands': [1.430000, 1.450000, 103.770000, 103.800000],
        'Yishun': [1.410000, 1.430000, 103.820000, 103.850000]
    }

    # Generate delivery IDs (DEL0001, DEL0002, ...)
    delivery_ids = [f'DEL{i:04d}' for i in range(1, n_deliveries + 1)]

    # Generate customer names (fictional)
    first_names = ['Tan', 'Lim', 'Lee', 'Ng', 'Wong', 'Chan', 'Goh', 'Ong', 'Teo', 'Koh',
                   'Chua', 'Loh', 'Yeo', 'Sim', 'Ho', 'Ang', 'Tay', 'Yap', 'Leong', 'Foo']
    last_names = ['Wei', 'Ming', 'Hui', 'Ling', 'Yong', 'Jun', 'Hong', 'Xin', 'Yi', 'Jie',
                  'Cheng', 'Kai', 'Zhi', 'Tian', 'Yu', 'En', 'Yang', 'Hao', 'Chong', 'Zheng']
    customer_names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n_deliveries)]

    addresses = []
    postal_codes = []
    latitudes = []
    longitudes = []
    neighborhood_names = []

    # Initialize geocoder if using geocoding
    if use_geocoding:
        geolocator = Nominatim(user_agent="delivery_app")

    # Generate realistic addresses
    for _ in range(n_deliveries):
        # Randomly select a neighborhood, then a street in that neighborhood
        neighborhood, streets, postal_prefix = random.choice(sg_neighborhoods)
        street = random.choice(streets)

        # Generate block number (realistic for HDB)
        block = random.randint(100, 600)

        # Generate unit number
        unit_floor = random.randint(2, 20)
        unit_number = random.randint(1, 150)

        # Generate postal code: Singapore postal codes have 6 digits, so the
        # 2-digit prefix is followed by a zero-padded 4-digit suffix.
        postal_suffix = str(random.randint(0, 9999)).zfill(4)
        postal_code = postal_prefix + postal_suffix

        # Create two formats of address - one for display, one for geocoding
        display_address = f"Block {block}, #{unit_floor:02d}-{unit_number:03d}, {street}, Singapore {postal_code}"
        geocode_address = f"{block} {street}, Singapore {postal_code}"  # Simpler format for geocoding

        # Default coordinates: a random point inside the neighborhood bounding box (fallback)
        min_lat, max_lat, min_lon, max_lon = neighborhood_bounds[neighborhood]
        default_lat = round(random.uniform(min_lat, max_lat), 6)
        default_lon = round(random.uniform(min_lon, max_lon), 6)

        # Start from the fallback; overwritten only by a successful lookup
        lat, lon = default_lat, default_lon

        # Use geocoding API if requested
        if use_geocoding:
            try:
                location = geolocator.geocode(geocode_address)

                if location:
                    lat, lon = location.latitude, location.longitude
                    print(f"✓ Successfully geocoded: {geocode_address} → ({lat}, {lon})")
                else:
                    # Pause before the retry so two requests are never sent back-to-back
                    time.sleep(1)

                    # First fallback: try with just street and postal code
                    simpler_address = f"{street}, Singapore {postal_code}"
                    location = geolocator.geocode(simpler_address)

                    if location:
                        lat, lon = location.latitude, location.longitude
                        print(f"✓ Fallback geocoded: {simpler_address} → ({lat}, {lon})")
                    else:
                        # Second fallback: keep the random point in the neighborhood box
                        print(f"✗ Could not geocode: {geocode_address}, using neighborhood coordinates")

            except Exception as e:
                # Deliberate best-effort: any geocoder failure (timeout, network,
                # service error) falls back to the bounding-box coordinates.
                print(f"✗ Geocoding error for {geocode_address}: {str(e)}")
                lat, lon = default_lat, default_lon
            finally:
                # Add delay to avoid being rate limited -- also after errors
                time.sleep(1)

        addresses.append(display_address)
        postal_codes.append(postal_code)
        latitudes.append(lat)
        longitudes.append(lon)
        neighborhood_names.append(neighborhood)

    # Generate delivery dates (within the next 7 days)
    base_date = datetime.now().date()
    delivery_dates = [base_date + timedelta(days=random.randint(1, 7)) for _ in range(n_deliveries)]

    # Generate time windows: start between 09:00 and 16:00, end capped at 18:00
    time_windows = []
    for _ in range(n_deliveries):
        start_hour = random.randint(9, 16)
        window_length = random.choice([1, 2, 3])  # 1, 2, or 3 hour windows
        end_hour = min(start_hour + window_length, 18)
        time_windows.append(f"{start_hour:02d}:00-{end_hour:02d}:00")

    # Generate package details
    weights = np.random.uniform(0.5, 20.0, n_deliveries)  # in kg
    volumes = np.random.uniform(0.01, 0.5, n_deliveries)  # in m³

    # Priority levels
    priorities = np.random.choice(['High', 'Medium', 'Low'], n_deliveries,
                                  p=[0.2, 0.5, 0.3])  # 20% High, 50% Medium, 30% Low

    # Required vehicle type
    vehicle_types = np.random.choice(['Standard', 'Large', 'Refrigerated'], n_deliveries,
                                     p=[0.7, 0.2, 0.1])

    # Status
    statuses = np.random.choice(['Pending', 'Assigned', 'In Transit', 'Delivered'], n_deliveries,
                                p=[0.6, 0.2, 0.15, 0.05])

    # Additional notes
    special_instructions = [
        'Call customer before delivery',
        'Fragile items',
        'Leave at door',
        'Signature required',
        'No delivery on weekends',
        None
    ]

    # 70% of rows draw from the pool above; the pool itself contains None,
    # so the share of rows with an actual note is somewhat below 70%.
    notes = [
        random.choice(special_instructions) if random.random() < 0.7 else None
        for _ in range(n_deliveries)
    ]

    # Create DataFrame
    df = pd.DataFrame({
        'delivery_id': delivery_ids,
        'customer_name': customer_names,
        'address': addresses,
        'postal_code': postal_codes,
        'neighborhood': neighborhood_names,
        'latitude': latitudes,
        'longitude': longitudes,
        'delivery_date': delivery_dates,
        'time_window': time_windows,
        'weight_kg': weights.round(2),
        'volume_m3': volumes.round(3),
        'priority': priorities,
        'vehicle_type': vehicle_types,
        'status': statuses,
        'special_instructions': notes
    })

    # Ensure the directory exists. Resolve __file__ to an absolute path first so
    # the three-levels-up lookup does not depend on the current working directory.
    this_file = os.path.abspath(__file__)
    project_root = os.path.dirname(os.path.dirname(os.path.dirname(this_file)))
    data_dir = os.path.join(project_root, 'data', 'delivery-data')
    os.makedirs(data_dir, exist_ok=True)

    # Save to CSV
    output_path = os.path.join(data_dir, 'delivery_data.csv')
    df.to_csv(output_path, index=False)
    print(f"Delivery data generated and saved to {output_path}")
    return df

if __name__ == "__main__":
    # Script entry point: build 50 deliveries and show the first rows.
    # USE_GEOCODING=True performs real geocoding lookups (slower but more
    # accurate); set it to False to use bounding-box coordinates only.
    USE_GEOCODING = True
    delivery_data = generate_delivery_data(n_deliveries=50, use_geocoding=USE_GEOCODING)
    preview = delivery_data.head()
    print("Sample of delivery data:")
    print(preview)