# create_bins.py
"""This script creates Jenks-based bins for visualization based on all
available score data.
"""
import pandas as pd
import jenkspy
import numpy as np
import os
# Root data directory on the author's machine.
data_folder = r"C:\Users\Willem\Documents\Project\TransitCenter Equity Pulse\data"

# Regions for which bins are produced.
regions = ['boston', 'chicago', 'dc', 'la', 'nyc', 'philadelphia', 'sf']


def _keys_for(time_tag, auto):
    """Return the 14 score keys that share one time tag and one auto flag.

    Order within the group: all C000 keys, then all CE01 keys (each walking
    c30, c45, c60 with fareN before fareY), followed by the two parks keys
    (c15, c30), which only appear with fareN.
    """
    tail = f"{time_tag}_auto{auto}"
    keys = [
        f"{measure}_P_c{level}_{tail}_fare{fare}"
        for measure in ("C000", "CE01")
        for level in (30, 45, 60)
        for fare in "NY"
    ]
    keys.extend(f"parks_P_c{level}_{tail}_fareN" for level in (15, 30))
    return keys


# Every score key that gets its own set of bins: one group of 14 keys per
# (time tag, auto flag) combination, then the two trip-count keys at the end.
score_keys = [
    key
    for time_tag in ("AM", "PM", "WE")
    for auto in "NY"
    for key in _keys_for(time_tag, auto)
]
score_keys += ["los_trips_WKD", "los_trips_SAT"]
# One dict per (region, score_key) pair; becomes one row of bins.csv.
out = []
# Explicit column types so ids and dates stay strings when the CSV is parsed.
dtype = {'bg_id': str, 'score': float, 'score_key': str, 'date': str}
for region in regions:
    print(f"Running {region}")
    scores = pd.read_csv(
        os.path.join(data_folder, 'load', region, 'scores.csv'), dtype=dtype
    )
    print(f" Data Loaded For {region}")
    for score_key in score_keys:
        # Drop NaN scores before sampling so they never reach jenkspy.
        keyed = scores.loc[scores.score_key == score_key, 'score'].dropna()
        if keyed.empty:
            # Nothing to classify for this key in this region; previously
            # this case crashed inside sample().
            print(f" No scores for {score_key}, skipping")
            continue
        max_val = keyed.max()
        min_val = keyed.min()
        # Jenks is computed on a random sample of at most 5000 scores. The
        # sample size is capped at the population size because sample()
        # raises ValueError when asked for more rows than exist.
        sample = keyed.sample(min(len(keyed), 5000))
        # The class count (5) is passed positionally so the call works both
        # with releases that name the parameter `nb_class` and with those
        # that name it `n_classes`.
        # NOTE(review): jenkspy may still reject keys with very few values.
        breaks = jenkspy.jenks_breaks(sample, 5)
        # The sample may miss the true extremes, so pin the outer edges to
        # the min/max of the full set of scores for this key.
        breaks[0] = min_val
        breaks[-1] = max_val
        data = {'region': region, 'score_key': score_key}
        # Five classes give six break values: columns bin_0 .. bin_5.
        data.update({f'bin_{i}': edge for i, edge in enumerate(breaks)})
        out.append(data)
        print(f" Jenks complete for {score_key}")
    print(f"Finished {region}")
out_df = pd.DataFrame(out)
# Written to the current working directory.
out_df.to_csv('bins.csv', index=False)