-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpostcode_lookup.py
More file actions
106 lines (86 loc) · 4.37 KB
/
postcode_lookup.py
File metadata and controls
106 lines (86 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
# 1. LOAD DATA
# Read both input tables from the working directory in a single pass:
# the hospital list first, then the master postcode list.
hospitals, postcodes = (
    pd.read_csv(csv_name)
    for csv_name in ('hospitals_refined.csv', 'postcodes_master.csv')
)
# 2. HAVERSINE DISTANCE (vectorised, returns km)
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in km between coordinate pairs.

    All four arguments are in degrees and may be scalars or array-likes;
    they are combined element-wise, so the result follows NumPy
    broadcasting of the inputs. A spherical Earth of radius 6371 km is used.
    """
    earth_radius_km = 6371.0
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    # Half-angle sines of the latitude and longitude differences.
    sin_half_dphi = np.sin((phi2 - phi1) / 2)
    sin_half_dlam = np.sin((lam2 - lam1) / 2)

    # Haversine of the central angle between the two points.
    hav = sin_half_dphi ** 2 + np.cos(phi1) * np.cos(phi2) * sin_half_dlam ** 2
    return earth_radius_km * 2 * np.arcsin(np.sqrt(hav))
# 3. FUNCTION TO DETERMINE SIDE OF RIVER
# Postcode-area prefixes (leading letters of the outcode) classified as 'North'.
_NORTH_PREFIXES = frozenset({
    'N', 'NW', 'E', 'EN', 'HA', 'IG', 'RM', 'UB', 'WD', 'WC', 'EC', 'W',
})

# SW district numbers that sit north of the Thames.
_NORTH_SW_DISTRICTS = frozenset({'1', '3', '5', '6', '7', '10'})

# Individual postcodes (spaces removed) that are forced to 'South' even though
# their SW1 / SW3 district would otherwise classify as 'North'.
_SOUTH_EXCEPTIONS = frozenset({
    'SW1W9FJ', 'SW1P9UP', 'SW1Y6YQ', 'SW1Y5WT', 'SW1Y4ZB', 'SW1Y6WZ',
    'SW1Y5ZP', 'SW1X9UQ', 'SW1X8AX', 'SW1Y5WD', 'SW39GA', 'SW39EA',
    'SW39BW', 'SW39DU', 'SW39EG', 'SW39GG', 'SW1Y5ZG', 'SW1X9ZU',
    'SW39FX', 'SW39DX',
})


def get_side(pc):
    """Classify a postcode as 'North' or 'South' of the river.

    Args:
        pc: Postcode string, with or without the separating space, in any
            letter case. A bare outcode (e.g. 'SW10') is also accepted.

    Returns:
        'North' or 'South'. Anything whose area prefix is not recognised as
        northern (including an empty string) is classified as 'South'.
    """
    pc = pc.strip().upper()

    # Hand-curated exceptions win over the prefix / district rules.
    if pc.replace(' ', '') in _SOUTH_EXCEPTIONS:
        return 'South'

    # A UK incode is one digit followed by two letters. Strip the last three
    # characters only when they really look like an incode; otherwise the
    # input is a bare or partial postcode and everything before the first
    # space is the outcode. (Blindly chopping three characters turned 'SW10'
    # into 'S' and misclassified it.)
    has_incode = len(pc) > 3 and pc[-3].isdigit() and pc[-2:].isalpha()
    outcode = pc[:-3].strip() if has_incode else pc.split(' ')[0]

    # Area prefix = leading letters only (WC2R -> WC, EC1A -> EC, SW1A -> SW).
    n_letters = next(
        (i for i, ch in enumerate(outcode) if not ch.isalpha()),
        len(outcode),
    )
    prefix = outcode[:n_letters]

    if prefix == 'SW':
        # District number = digits after the prefix (SW1V -> '1', SW10 -> '10').
        district_num = ''.join(ch for ch in outcode[n_letters:] if ch.isdigit())
        return 'North' if district_num in _NORTH_SW_DISTRICTS else 'South'

    return 'North' if prefix in _NORTH_PREFIXES else 'South'
# 4. VECTORISED SPATIAL SEARCH
def find_nearest(pc_group, hosp_subset, *, ref_lat=51.5):
    """Find the nearest hospital for a batch of postcodes using a single KD-tree.

    Args:
        pc_group: DataFrame of postcodes with 'Latitude' and 'Longitude'
            columns in degrees. Its index labels both returned Series.
        hosp_subset: DataFrame of candidate hospitals with 'Hospital Name',
            'Latitude' and 'Longitude' columns. Rows are addressed by
            position, so any index is acceptable.
        ref_lat: Reference latitude in degrees used to shrink longitude
            differences by cos(ref_lat) for the planar nearest-neighbour
            search. Defaults to 51.5 (London); pass the latitude of the
            study area when using the function elsewhere.

    Returns:
        Tuple ``(names, dists)`` of Series indexed like ``pc_group``: the name
        of the matched hospital, and the Haversine distance to it in km
        rounded to 2 decimals. When ``hosp_subset`` is empty every name is
        "None Found" and every distance is 0.0.
    """
    n = len(pc_group)
    if hosp_subset.empty:
        # NOTE(review): 0.0 is a placeholder here, not a real distance;
        # consumers must check for the "None Found" name.
        return (pd.Series(["None Found"] * n, index=pc_group.index),
                pd.Series([0.0] * n, index=pc_group.index))

    # Scale longitude by cos(lat) so Euclidean distance in the scaled plane
    # approximates true distance near the reference latitude.
    scale = np.array([1.0, np.cos(np.radians(ref_lat))])
    hosp_coords = hosp_subset[['Latitude', 'Longitude']].values * scale
    pc_coords = pc_group[['Latitude', 'Longitude']].values * scale

    # One tree over the hospitals, one batched query for all postcodes.
    tree = cKDTree(hosp_coords)
    _, indices = tree.query(pc_coords)
    matched = hosp_subset.iloc[indices]
    names = pd.Series(matched['Hospital Name'].values, index=pc_group.index)

    # Accurate Haversine distances on the original (unscaled) coordinates.
    dists = haversine(
        pc_group['Latitude'].values, pc_group['Longitude'].values,
        matched['Latitude'].values, matched['Longitude'].values,
    )
    dists = pd.Series(np.round(dists, 2), index=pc_group.index)
    return names, dists
# 5. EXECUTION
# Tag every postcode with its side of the river, then seed the output frame
# with the base columns; the result columns are added per side below.
postcodes['Side'] = postcodes['Postcode'].apply(get_side)
results = postcodes[['Postcode', 'Latitude', 'Longitude', 'Side']].copy()

# (column label, hospital level) pairs; a level of None means "no level filter".
level_filters = (('Any', None), ('L1', 1), ('L2', 2), ('L3', 3))

for side in ('North', 'South'):
    side_postcodes = postcodes[postcodes['Side'] == side]
    # Hospitals flagged 'Both' are candidates for either side.
    side_hospitals = hospitals[hospitals['Side'].isin([side, 'Both'])]

    for label, level in level_filters:
        if level is None:
            candidates = side_hospitals
        else:
            candidates = side_hospitals[side_hospitals['Level'] == level]

        # find_nearest addresses hospitals by position, so hand it a clean
        # 0..n-1 index.
        names, dists = find_nearest(side_postcodes, candidates.reset_index(drop=True))

        # Write back only the rows belonging to this side.
        results.loc[side_postcodes.index, f'Closest_{label}'] = names
        results.loc[side_postcodes.index, f'Distance_{label}_km'] = dists
# 6. SAVE SEPARATE FILE PER HOSPITAL (based on Closest_Any)
import os
import re

os.makedirs('output', exist_ok=True)

# Characters other than word characters, whitespace and hyphens are dropped
# from hospital names before they are used as file names.
unsafe_chars = re.compile(r'[^\w\s-]')

# One CSV per distinct value of 'Closest_Any'.
for hospital, group in results.groupby('Closest_Any'):
    cleaned = unsafe_chars.sub('', hospital).strip()
    safe_name = cleaned.replace(' ', '_')
    group.to_csv(f'output/{safe_name}.csv', index=False)
    print(f" → output/{safe_name}.csv ({len(group)} rows)")
# 7. SAVE COMBINED FILE
# Write every postcode row to one combined CSV, then report the totals.
total_rows = len(results)
hospital_file_count = results['Closest_Any'].nunique()

results.to_csv('output/All_Postcodes.csv', index=False)
print(f"\n → output/All_Postcodes.csv ({total_rows} rows)")
print(f"\nDone! {total_rows} postcodes across {hospital_file_count} hospital files + 1 combined file.")