-
Notifications
You must be signed in to change notification settings - Fork 8
/
dump.py
executable file
·105 lines (85 loc) · 3.44 KB
/
dump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python3
from typing import List, Iterable
import kyfw
import hyfw
import tmis
from stations import path, dump_stations
from util import shell, progress, open
def combine_stations() -> Iterable[List[str]]:
    """Combine the two railway station datasets by telecode.

    Records from ``hyfw.dfs()`` seed a telecode-keyed table of
    ``[pinyin, name, TMIS, SSJC]`` rows (read from the ``PYM``, ``ZMHZ``,
    ``TMIS`` and ``SSJC`` fields and keyed by ``DBM``).  Records from
    ``kyfw.stations()`` are then merged in: their pinyin code and name
    overwrite the first two fields of the matching row, and a new row is
    created when the telecode is unknown.

    Whenever the name and the telecode disagree between the two sources,
    the conflict is described in a message and handed to ``shell`` together
    with the local variables, so it can be resolved manually.

    Yields:
        One ``[pinyin, name, telecode, TMIS, SSJC]`` list per station.  The
        telecode field is left empty when the table key contains a space.
    """
    stations = {}  # telecode -> [pinyin, name, TMIS, SSJC]
    names = {}     # name -> telecode, for the first dataset only

    stations_95306 = hyfw.dfs()
    stations_12306 = kyfw.stations()

    for s in stations_95306:
        pinyin = s['PYM'].lower()
        # keep exactly three letters: shorten longer codes to the first two
        # letters plus the last one, and discard codes that are too short
        if len(pinyin) > 3:
            pinyin = pinyin[:2] + pinyin[-1]
        elif len(pinyin) < 3:
            pinyin = ''
        name, telecode = s['ZMHZ'], s['DBM']
        stations[telecode] = [pinyin, name, s['TMIS'], s['SSJC']]
        names[name] = telecode

    for s in stations_12306:
        old = stations.get(s.telecode)
        new = [s.pinyin_code, s.name, '', '']

        # same name, but registered under a different telecode
        if s.name in names and s.telecode != names[s.name]:
            conflict = '%s/%s -> %s/%s' % (names[s.name], new, s.telecode, old)
            if s.telecode in stations:
                # the new telecode is already taken by another station
                conflict = 'Name conflict: %s -> ?' % conflict
            else:
                conflict = 'Solved conflict: %s' % conflict
                # move the existing row to the new telecode
                old = stations[s.telecode] = stations.pop(names[s.name])
                old[:2] = new[:2]

        # same telecode, but registered under a different name
        elif s.telecode in stations and s.name != stations[s.telecode][1]:
            # `tmis` is imported as a module, so calling it directly raises
            # TypeError; go through its `dfs` lookup, which returns a
            # name -> TMIS code mapping.
            # NOTE(review): confirm `tmis.dfs` is the intended lookup for a
            # single full station name.
            new[-2] = tmis.dfs(s.name).get(s.name, '')
            conflict = '%s/(%s => %s)' % (s.telecode, old, new)
            if new[-2] and old[-2] != new[-2]:  # TMIS codes conflict
                conflict = 'Ambiguous telecode: %s' % conflict
            else:
                conflict = 'Solved conflict: %s' % conflict
                old[:2] = new[:2]

        # no disagreement: create the row if needed and copy pinyin and name
        else:
            if s.telecode not in stations:
                stations[s.telecode] = [''] * 4
            stations[s.telecode][:2] = new[:2]
            continue

        # resolve merge conflicts manually
        shell(dict(vars(), s=stations), '\n%s' % conflict)

    for k, v in stations.items():
        # drop telecodes with spaces
        # so those can be used as temporary names in conflict solving
        v.insert(2, '' if ' ' in k else k)
        yield v
def heuristic_search(stations, initials=None) -> Iterable[List[str]]:
    """Search the TMIS database using name initials.

    Args:
        stations: Indexable sequence of station rows.  Field ``1`` is read
            as the station name and field ``-2`` as its TMIS code; rows may
            be updated in place.
        initials: Search keys passed one by one to ``tmis.dfs``.  When
            omitted or empty, the first character of every non-empty
            station name and the last field of every row are used.

    Yields:
        A ``['', name, '', tmis_code, '']`` row for every search result
        whose name and TMIS code are both absent from ``stations``.  Each
        distinct ``(name, tmis_code)`` pair is yielded at most once.

    Results whose name is already known fill in a missing TMIS code on the
    existing row; if the row holds a different code, the conflict is handed
    to ``shell`` together with the local variables.
    """
    # create indexes for faster lookup
    names, tmis_codes = (
        {s[field]: index for index, s in enumerate(stations)}
        for field in (1, -2)
    )

    if not initials:
        # skip empty names, which have no first character to search for
        initials = {name[0] for name in names if name}.union(
            {s[-1] for s in stations}
        )

    # different initials may return the same station; remember what has
    # been emitted so the caller does not receive identical rows twice
    emitted = set()

    for initial in initials:
        progress()
        for name, tmis_code in tmis.dfs(initial).items():
            # append as a new station
            if name not in names and tmis_code not in tmis_codes:
                if (name, tmis_code) not in emitted:
                    emitted.add((name, tmis_code))
                    yield ['', name, '', tmis_code, '']

            # replace in-place
            elif name in names:
                old = stations[names[name]]
                if not old[-2]:
                    old[-2] = tmis_code
                elif old[-2] != tmis_code:
                    conflict = 'TMIS code conflict: %s' % old
                    shell(dict(vars(), s=stations), '\n%s' % conflict)
if __name__ == '__main__':
    # merge both datasets first, then append whatever the TMIS search adds
    stations = [*combine_stations()]
    stations += heuristic_search(stations)

    # hand the merged table to `shell` before anything is written
    # (presumably an interactive prompt -- confirm in util)
    shell(dict(vars(), s=stations), 'Well done.')

    with open(path, 'w') as f:
        print(dump_stations(stations), file=f)

    summary = 'Dumped %d stations to "%s".' % (len(stations), path)
    print(summary)