File size: 11,247 Bytes
300be5a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 |
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import os
import time
import requests
from geopy.geocoders import Nominatim
# Seed BOTH random number generators for reproducibility: this script draws
# from the stdlib `random` module (addresses, names, dates, notes) as well as
# from NumPy (weights, volumes, categorical columns). Seeding only NumPy
# leaves most of the generated data different on every run.
random.seed(42)
np.random.seed(42)
# Neighborhood table used to build plausible Singapore addresses.
# Each entry is:
#   (name, street names, two-digit postal code prefix,
#    (min_lat, max_lat, min_lon, max_lon) bounding box)
# Keeping streets and bounding box in one record means a neighborhood can
# never be present in one table and missing from the other.
_SG_NEIGHBORHOODS = (
    ('Ang Mo Kio',
     ('Ang Mo Kio Avenue 1', 'Ang Mo Kio Avenue 3', 'Ang Mo Kio Avenue 4', 'Ang Mo Kio Avenue 10'),
     '56', (1.360000, 1.380000, 103.830000, 103.860000)),
    ('Bedok',
     ('Bedok North Avenue 1', 'Bedok North Road', 'Bedok Reservoir Road', 'New Upper Changi Road'),
     '46', (1.320000, 1.335000, 103.920000, 103.950000)),
    ('Bishan',
     ('Bishan Street 11', 'Bishan Street 12', 'Bishan Street 13', 'Bishan Street 22'),
     '57', (1.345000, 1.360000, 103.830000, 103.855000)),
    ('Bukit Merah',
     ('Jalan Bukit Merah', 'Henderson Road', 'Tiong Bahru Road', 'Redhill Close'),
     '15', (1.270000, 1.290000, 103.800000, 103.830000)),
    ('Bukit Batok',
     ('Bukit Batok East Avenue 6', 'Bukit Batok West Avenue 8', 'Bukit Batok Street 21'),
     '65', (1.340000, 1.360000, 103.740000, 103.770000)),
    ('Clementi',
     ('Clementi Avenue 1', 'Clementi Avenue 4', 'Clementi Road', 'Commonwealth Avenue West'),
     '12', (1.310000, 1.325000, 103.750000, 103.780000)),
    ('Geylang',
     ('Geylang East Avenue 1', 'Geylang Road', 'Guillemard Road', 'Sims Avenue'),
     '38', (1.310000, 1.325000, 103.880000, 103.900000)),
    ('Hougang',
     ('Hougang Avenue 1', 'Hougang Avenue 7', 'Hougang Street 91', 'Upper Serangoon Road'),
     '53', (1.370000, 1.385000, 103.880000, 103.900000)),
    ('Jurong East',
     ('Jurong East Street 13', 'Jurong East Avenue 1', 'Jurong Gateway Road'),
     '60', (1.330000, 1.345000, 103.730000, 103.750000)),
    ('Jurong West',
     ('Jurong West Street 41', 'Jurong West Street 52', 'Jurong West Street 93'),
     '64', (1.340000, 1.360000, 103.690000, 103.720000)),
    ('Kallang',
     ('Kallang Avenue', 'Geylang Bahru', 'Boon Keng Road', 'Upper Boon Keng Road'),
     '33', (1.300000, 1.320000, 103.850000, 103.880000)),
    ('Punggol',
     ('Punggol Central', 'Punggol Field', 'Punggol Road', 'Punggol Way'),
     '82', (1.390000, 1.410000, 103.900000, 103.920000)),
    ('Queenstown',
     ('Commonwealth Avenue', 'Commonwealth Drive', 'Mei Chin Road', 'Stirling Road'),
     '14', (1.290000, 1.310000, 103.780000, 103.805000)),
    ('Sengkang',
     ('Sengkang East Way', 'Sengkang West Way', 'Compassvale Road', 'Fernvale Road'),
     '54', (1.380000, 1.395000, 103.870000, 103.900000)),
    ('Serangoon',
     ('Serangoon Avenue 2', 'Serangoon Avenue 3', 'Serangoon North Avenue 1'),
     '55', (1.345000, 1.360000, 103.865000, 103.885000)),
    ('Tampines',
     ('Tampines Street 11', 'Tampines Street 21', 'Tampines Avenue 1', 'Tampines Avenue 4'),
     '52', (1.345000, 1.365000, 103.930000, 103.960000)),
    ('Toa Payoh',
     ('Toa Payoh Lorong 1', 'Toa Payoh Lorong 2', 'Toa Payoh Lorong 4', 'Toa Payoh Central'),
     '31', (1.326000, 1.341000, 103.840000, 103.865000)),
    ('Woodlands',
     ('Woodlands Avenue 1', 'Woodlands Drive 16', 'Woodlands Drive 72', 'Woodlands Circle'),
     '73', (1.430000, 1.450000, 103.770000, 103.800000)),
    ('Yishun',
     ('Yishun Avenue 1', 'Yishun Avenue 4', 'Yishun Ring Road', 'Yishun Street 22'),
     '76', (1.410000, 1.430000, 103.820000, 103.850000)),
)

# Fictional customer names are built as "<surname> <given name>".
_SURNAMES = ('Tan', 'Lim', 'Lee', 'Ng', 'Wong', 'Chan', 'Goh', 'Ong', 'Teo', 'Koh',
             'Chua', 'Loh', 'Yeo', 'Sim', 'Ho', 'Ang', 'Tay', 'Yap', 'Leong', 'Foo')
_GIVEN_NAMES = ('Wei', 'Ming', 'Hui', 'Ling', 'Yong', 'Jun', 'Hong', 'Xin', 'Yi', 'Jie',
                'Cheng', 'Kai', 'Zhi', 'Tian', 'Yu', 'En', 'Yang', 'Hao', 'Chong', 'Zheng')

# Pool of delivery notes. The trailing None means that even a delivery
# selected to "have a note" can still end up without one.
_SPECIAL_INSTRUCTIONS = (
    'Call customer before delivery',
    'Fragile items',
    'Leave at door',
    'Signature required',
    'No delivery on weekends',
    None,
)


def _geocode_or_fallback(geolocator, full_address, street_address, fallback, pause_seconds=1.0):
    """Geocode an address, degrading to a coarser query and then to *fallback*.

    Tries ``full_address`` first and ``street_address`` second. Every request,
    successful or not, is followed by ``pause_seconds`` of sleep so that the two
    attempts for one delivery are never sent back to back (the public
    Nominatim service rate-limits clients).

    Args:
        geolocator: Object exposing ``geocode(query)`` that returns either a
            falsy value or an object with ``latitude``/``longitude``.
        full_address: Block + street + postal code query.
        street_address: Street + postal code query used as the second attempt.
        fallback: ``(lat, lon)`` tuple returned when nothing could be geocoded
            or the geocoder raised.
        pause_seconds: Delay applied after each geocoding request.

    Returns:
        A ``(latitude, longitude)`` tuple.
    """
    attempts = (("Successfully", full_address), ("Fallback", street_address))
    for label, query in attempts:
        try:
            location = geolocator.geocode(query)
        except Exception as exc:
            # Deliberate best-effort boundary: any geocoder/network failure
            # must not abort data generation, so fall back to synthetic
            # coordinates instead of propagating.
            print(f"[ERROR] Geocoding error for {full_address}: {exc}")
            return fallback
        finally:
            # Runs after every request, including failed ones.
            time.sleep(pause_seconds)
        if location:
            lat, lon = location.latitude, location.longitude
            print(f"[OK] {label} geocoded: {query} -> ({lat}, {lon})")
            return lat, lon

    print(f"[WARN] Could not geocode: {full_address}, using neighborhood coordinates")
    return fallback


def _random_time_window():
    """Return an ``HH:00-HH:00`` window starting 09:00-16:00, ending by 18:00."""
    start_hour = random.randint(9, 16)
    window_length = random.choice([1, 2, 3])  # 1, 2, or 3 hour windows
    end_hour = min(start_hour + window_length, 18)
    return f"{start_hour:02d}:00-{end_hour:02d}:00"


def generate_delivery_data(n_deliveries=50, use_geocoding=False, output_path=None):
    """Generate synthetic delivery data with plausible Singapore addresses.

    Each delivery gets a random neighborhood, street, block and unit number, a
    six-digit postal code (two-digit neighborhood prefix + four random digits),
    coordinates, a delivery date 1-7 days from today, a time window, package
    weight/volume, priority, vehicle type, status and an optional note.

    Args:
        n_deliveries: Number of delivery rows to generate.
        use_geocoding: When True, look coordinates up through Nominatim
            (at least one second per request). When False, or when the lookup
            fails, a random point inside the neighborhood's bounding box is
            used instead.
        output_path: Destination CSV file. Defaults to
            ``<three levels above this file>/data/delivery-data/delivery_data.csv``.

    Returns:
        pandas.DataFrame with columns ``delivery_id``, ``customer_name``,
        ``address``, ``postal_code``, ``neighborhood``, ``latitude``,
        ``longitude``, ``delivery_date``, ``time_window``, ``weight_kg``,
        ``volume_m3``, ``priority``, ``vehicle_type``, ``status`` and
        ``special_instructions``.

    Side effects:
        Creates the output directory if needed, writes the CSV, and prints
        progress messages to stdout.
    """
    delivery_ids = [f'DEL{i:04d}' for i in range(1, n_deliveries + 1)]
    customer_names = [
        f"{random.choice(_SURNAMES)} {random.choice(_GIVEN_NAMES)}"
        for _ in range(n_deliveries)
    ]

    # The geocoder is only constructed when it is actually going to be used.
    geolocator = Nominatim(user_agent="delivery_app") if use_geocoding else None

    addresses = []
    postal_codes = []
    latitudes = []
    longitudes = []
    neighborhood_names = []

    for _ in range(n_deliveries):
        neighborhood, streets, postal_prefix, bounds = random.choice(_SG_NEIGHBORHOODS)
        street = random.choice(streets)

        block = random.randint(100, 600)
        unit_floor = random.randint(2, 20)
        unit_number = random.randint(1, 150)

        # Singapore postal codes have six digits: the two-digit prefix is
        # followed by FOUR more digits (three would yield an invalid code).
        postal_code = f"{postal_prefix}{random.randint(0, 9999):04d}"

        # Two address formats: a full one for display, a simpler one that
        # geocoders handle better.
        display_address = (
            f"Block {block}, #{unit_floor:02d}-{unit_number:03d}, "
            f"{street}, Singapore {postal_code}"
        )
        geocode_address = f"{block} {street}, Singapore {postal_code}"

        # Fallback coordinates: a random point inside the bounding box.
        min_lat, max_lat, min_lon, max_lon = bounds
        fallback = (
            round(random.uniform(min_lat, max_lat), 6),
            round(random.uniform(min_lon, max_lon), 6),
        )

        if use_geocoding:
            lat, lon = _geocode_or_fallback(
                geolocator,
                geocode_address,
                f"{street}, Singapore {postal_code}",
                fallback,
            )
        else:
            lat, lon = fallback

        addresses.append(display_address)
        postal_codes.append(postal_code)
        latitudes.append(lat)
        longitudes.append(lon)
        neighborhood_names.append(neighborhood)

    # Delivery dates fall within the next 7 days (tomorrow .. today + 7).
    base_date = datetime.now().date()
    delivery_dates = [
        base_date + timedelta(days=random.randint(1, 7)) for _ in range(n_deliveries)
    ]

    time_windows = [_random_time_window() for _ in range(n_deliveries)]

    # Package details
    weights = np.random.uniform(0.5, 20.0, n_deliveries)  # in kg
    volumes = np.random.uniform(0.01, 0.5, n_deliveries)  # in m³

    # 20% High, 50% Medium, 30% Low
    priorities = np.random.choice(['High', 'Medium', 'Low'], n_deliveries,
                                  p=[0.2, 0.5, 0.3])
    vehicle_types = np.random.choice(['Standard', 'Large', 'Refrigerated'], n_deliveries,
                                     p=[0.7, 0.2, 0.1])
    statuses = np.random.choice(['Pending', 'Assigned', 'In Transit', 'Delivered'], n_deliveries,
                                p=[0.6, 0.2, 0.15, 0.05])

    # 70% of deliveries draw from the instruction pool (which itself
    # contains None); the rest get no note at all.
    notes = [
        random.choice(_SPECIAL_INSTRUCTIONS) if random.random() < 0.7 else None
        for _ in range(n_deliveries)
    ]

    df = pd.DataFrame({
        'delivery_id': delivery_ids,
        'customer_name': customer_names,
        'address': addresses,
        'postal_code': postal_codes,
        'neighborhood': neighborhood_names,
        'latitude': latitudes,
        'longitude': longitudes,
        'delivery_date': delivery_dates,
        'time_window': time_windows,
        'weight_kg': weights.round(2),
        'volume_m3': volumes.round(3),
        'priority': priorities,
        'vehicle_type': vehicle_types,
        'status': statuses,
        'special_instructions': notes,
    })

    if output_path is None:
        # abspath guards against a relative __file__, for which repeated
        # dirname() calls would collapse to '' instead of walking upwards.
        project_root = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        )
        output_path = os.path.join(project_root, 'data', 'delivery-data', 'delivery_data.csv')

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"Delivery data generated and saved to {output_path}")
    return df
if __name__ == "__main__":
    # Real geocoding is slower but more accurate; switch to False to skip it.
    USE_GEOCODING = True

    delivery_data = generate_delivery_data(n_deliveries=50, use_geocoding=USE_GEOCODING)

    # Show the heading and the first rows of the generated table.
    print("Sample of delivery data:", delivery_data.head(), sep="\n")
|