-
Notifications
You must be signed in to change notification settings - Fork 28
/
update_locations.py
87 lines (60 loc) · 1.66 KB
/
update_locations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
'replace location columns from the original file with parsed location (five columns)'
'update_locations.py Location_Tree.csv input.csv output.csv'
import sys, csv
from collections import defaultdict
from pprint import pprint
###
def get_full_loc( loc, tree = None, depth = 5 ):
    """Expand a location name into a fixed-width list of its ancestors.

    Walks the parent lookup from the deepest considered level down to
    level 0.  Whenever the current name is found at a level, its parent is
    put in front of the path and the walk continues from that parent.  The
    walk stops as soon as a parent of None (a root entry) is reached.

    loc   -- location name to expand.
    tree  -- mapping ``level -> {name: parent name or None}``; when omitted
             the module-level ``levels`` mapping is used.
    depth -- number of levels consulted and length of the returned list
             (default 5).

    Returns a list of exactly ``depth`` strings: the ancestors followed by
    ``loc`` itself, right-padded with '' (and truncated if longer).  A name
    that is not found at any level yields ``[loc, '', '', ...]``.
    """
    if tree is None:
        # Fall back to the lookup built at module level by the first pass.
        tree = levels
    path = [ loc ]
    current = loc
    for level in range( depth - 1, -1, -1 ):
        # Only the lookup itself can raise; a miss just means the name does
        # not live at this level (or the level does not exist at all).
        try:
            parent = tree[level][current]
        except KeyError:
            continue
        if parent is None:
            # Reached a root entry: nothing further to prepend.
            break
        path.insert( 0, parent )
        current = parent
    # Pad to the fixed width, then cut off anything beyond it.
    path.extend( [ '' ] * depth )
    return path[:depth]
###
# Command-line arguments: location tree file, input CSV, output CSV.
loc_file = sys.argv[1]  # e.g. data/orig/Location_Tree.csv
input_file = sys.argv[2]
output_file = sys.argv[3]
# Header name of the input column that holds the location to expand.
loc_col = 'LocationNormalized'

# first pass: build a dictionary
# levels[depth][name] -> name one level up on the same line (None at depth 0).
levels = defaultdict( dict )
# The with-block closes the tree file as soon as it has been read.
with open( loc_file ) as l_f:
    # Only the first field of every row is used; it is split on '~' below.
    # Presumably '"' was chosen as delimiter so that commas never split a
    # row - verify against the tree file.
    # NOTE(review): recent Python 3 releases may reject a delimiter equal to
    # the default quotechar - confirm before running this on Python 3.
    reader = csv.reader( l_f, delimiter = '"' )
    for line in reader:
        if not line:
            # csv yields [] for a blank line; skip it instead of failing
            # on line[0].
            continue
        parent = None
        for i, loc in enumerate( line[0].split( '~' ) ):
            levels[i][loc] = parent
            parent = loc

# Depth 6 is discarded.  pop() with a default tolerates trees that never get
# that deep, where a plain `del` would raise KeyError.
levels.pop( 6, None )
# Single-argument print( ... ) produces the same output on Python 2 and 3.
print( "levels: %s" % ( sorted( levels.keys() ) ) )
# second pass: update data
# The csv module expects a binary output file on Python 2, and text files
# opened with newline='' on Python 3.
if sys.version_info[0] < 3:
    write_mode = 'wb'
    open_extra = {}
else:
    write_mode = 'w'
    open_extra = { 'newline': '' }

# Both files are closed (and the output flushed) when the with-block ends,
# even if a row fails half-way through.
with open( input_file, **open_extra ) as i_f, \
        open( output_file, write_mode, **open_extra ) as o_f:
    reader = csv.reader( i_f )
    writer = csv.writer( o_f )

    # headers
    headers = next( reader )
    loc_col_i = headers.index( loc_col )
    # The column right before loc_col is dropped together with loc_col
    # itself.  Clamp at 0 so that a loc_col in first position does not turn
    # into a negative slice bound.
    cut_i = max( loc_col_i - 1, 0 )
    full_loc_headers = [ 'Loc1', 'Loc2', 'Loc3', 'Loc4', 'Loc5' ]
    headers = headers[:cut_i] + full_loc_headers + headers[loc_col_i + 1:]
    writer.writerow( headers )

    for n, line in enumerate( reader, 1 ):
        full_loc = get_full_loc( line[loc_col_i] )
        # getting rid of loc description
        new_line = line[:cut_i] + full_loc + line[loc_col_i + 1:]
        writer.writerow( new_line )
        # Progress indicator: one line every 10000 rows written.
        if n % 10000 == 0:
            print( n )