---
# CONFIG_VERSION
# The experiment configuration changes from time to time, and we upgrade the
# triage.experiments.CONFIG_VERSION variable whenever drastic changes that break
# old configuration files are released. Be sure to assign the config version
# that matches the triage.experiments.CONFIG_VERSION in the triage release
# you are developing against!
config_version: 'v8'
# EXPERIMENT METADATA
# model_comment (optional) will end up in the model_comment column of the
# models table for each model created in this experiment
model_comment: 'test'
# random_seed will be set in Python at the beginning of the experiment and
# sets all random seeds for all models
# If you don't specify a value in this block, it will be set for you and stored in the database
random_seed: 23895478
# TIME SPLITTING
# The time window to look at, and how to divide the window into
# train/test splits
#
# Most of these values are optional. The following are the
# default values that you will get if they are not set
# model_update_frequency = 100y
# training_as_of_date_frequencies = 100y
# test_as_of_date_frequencies = 100y
# max_training_histories = 0d
# test_durations = 0d
# feature_start_time = label_start_time = min(date) in your events tables (from_obj in feature_aggregations)
# feature_end_time = label_end_time = max(date) in your events tables (from_obj in feature_aggregations)
#
# ***NOTE*** These default values were selected to simplify things
# when you start a new project.
# ***Please change them as appropriate for your project.***
#
# Also note that if your label timespan is the same in testing and
# training, you can use a single label_timespans key instead of
# training_label_timespans and test_label_timespans, and triage
# will take care of it.
temporal_config:
feature_start_time: '1995-01-01' # earliest date included in features
feature_end_time: '2015-01-01' # latest date included in features
label_start_time: '2012-01-01' # earliest date for which labels are available
label_end_time: '2015-01-01' # day AFTER last label date (all dates in any model are < this date)
model_update_frequency: '6month' # how frequently to retrain models
training_as_of_date_frequencies: '1day' # time between as of dates for same entity in train matrix
test_as_of_date_frequencies: '3month' # time between as of dates for same entity in test matrix
max_training_histories: ['6month', '3month'] # length of time included in a train matrix
test_durations: ['0day', '1month', '2month'] # length of time included in a test matrix (0 days will give a single prediction immediately after training end)
training_label_timespans: ['1month'] # time period across which outcomes are labeled in train matrices
test_label_timespans: ['7day'] # time period across which outcomes are labeled in test matrices
#label_timespans: ['7day'] # you can use this if your train and test label timespans are the same (which is typical)
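# As an illustration of how these settings interact: with the values above,
# splits are generated working backwards from label_end_time, one per
# model_update_frequency (every 6 months). Each split gets train matrices
# covering up to 6 or 3 months of history (max_training_histories) with
# as_of_dates spaced 1 day apart, and test matrices covering 0 days, 1 month,
# or 2 months (test_durations) with as_of_dates spaced 3 months apart.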
# COHORT CONFIG
# [OPTIONAL: You can define cohorts and labels separately or you can have a combined cohorts and labels query
# in the LABEL GENERATION section below]
#
# Cohorts are configured by passing a query with a placeholder for the 'as_of_date'.
# The SQL query you specify here takes one parameter {as_of_date} and returns one column named entity_id
# which is the set of all entity_ids that make up the cohort as of {as_of_date}
#
# You may pass a relative filepath to a cohort query via the 'filepath' key (preferred) or use the 'query' key to include a cohort query directly in the config (a sketch of such a query appears below the cohort_config block)
# Cohort queries should return a column named 'entity_id' and be parameterized with an '{as_of_date}', to select the entity_ids that should be included for a given date. The {as_of_date} will be replaced with each as_of_date that the experiment needs. The returned 'entity_id' must be an integer.
#
# You may enter a 'name' for your configuration.
# This will be included in the metadata for each matrix and used to group models
# If you don't pass one, the string 'default' will be used.
#
# This block is completely optional. If you don't specify it
# it will default to all of the entities for which there are labels.
# The name by default in this case will be 'all_entities'
cohort_config:
filepath: 'example/cohort/past_events.sql'
name: 'past_events'
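# A minimal sketch of the kind of query the filepath above might contain
# (the 'events' table and its 'event_date' column are hypothetical):
#
#   select distinct entity_id
#   from events
#   where event_date < '{as_of_date}'::date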
# LABEL GENERATION
# Labels are configured with a query with placeholders for the 'as_of_date' and 'label_timespan'.
# You can pass a local path to a sql file containing the label query via the 'filepath' key (preferred) or include the query directly under the 'query' key
#
# The query must return two columns: entity_id and outcome, based on a given as_of_date and label_timespan.
# The as_of_date and label_timespan must be represented by parameters marked by curly brackets. The example below
# reproduces the inspection outcome boolean-or logic (see the sketch below the label_config block).
#
# In addition, you can configure what label is given to entities that are in the matrix
# (see 'cohort_config' section) but that do not show up in this label query.
# By default, these will show up as missing/null.
# However, passing the key 'include_missing_labels_in_train_as' allows you to pick True or False.
#
# In addition to these configuration options, you can pass a name to apply to the label configuration
# that will be present in matrix metadata for each matrix created by this experiment,
# under the 'label_name' key. The default label_name is 'outcome'.
label_config:
filepath: 'example/label/events.sql'
#include_missing_labels_in_train_as: False
#name: 'inspections'
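# A minimal sketch of the boolean-or label logic referenced above (the
# 'events' table and its 'outcome' and 'outcome_date' columns are
# hypothetical):
#
#   select
#     entity_id,
#     bool_or(outcome::bool)::integer as outcome
#   from events
#   where '{as_of_date}'::timestamp <= outcome_date
#     and outcome_date < '{as_of_date}'::timestamp + interval '{label_timespan}'
#   group by entity_id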
# FEATURE GENERATION
# The aggregate features to generate for each train/test split
#
# Implemented using the collate component:
# https://github.com/dssg/triage/tree/master/src/triage/component/collate
# Most terminology here is taken directly from collate
#
# Each entry describes a collate.SpacetimeAggregation object, and the
# arguments needed to create it. Each of these entries controls
# the features from one source table.
#
# Rules specifying how to handle imputation of null values must be explicitly
# defined in your config file. These can be specified in two places: either
# within each feature or overall for each type of feature (aggregates_imputation,
# categoricals_imputation, array_categoricals_imputation). In either case, a rule must be given for
# each aggregation function (e.g., sum, max, avg, etc.) used, or a catch-all
# can be specified with `all`. Aggregation function-specific rules will take
# precedence over the `all` rule and feature-specific rules will take
# precedence over the higher-level rules. Several examples are provided below.
#
# Available Imputation Rules:
# * mean: The average value of the feature (for SpacetimeAggregation the
# mean is taken within-date).
# * constant: Fill with a constant value from a required `value` parameter.
# * zero: Fill with zero.
# * zero_noflag: Fill with zero and don't create a missing flag
# * null_category: Only available for categorical features. Just flag null
# values with the null category column.
# * binary_mode: Only available for aggregate column types. Takes the modal
# value for a binary feature.
# * error: Raise an exception if any null values are encountered for this
# feature.
feature_aggregations:
-
# prefix given to the feature tables created
prefix: 'prefix'
# from_obj is usually a source table but can be an expression, such as
# a join (e.g. 'cool_stuff join other_stuff using (stuff_id)')
from_obj: 'cool_stuff'
# The date column to use for specifying which records to include
# in temporal features. It is important that the column used specifies
# the date at which the event is known about, which may be different
# from the date the event happened.
knowledge_date_column: 'open_date'
# top-level imputation rules that will apply to all aggregate functions.
# You can also specify categoricals_imputation or array_categoricals_imputation.
#
# You must specify imputation rules at either the top level or the
# feature level such that every feature being defined is covered.
aggregates_imputation:
# The `all` rule will apply to all aggregation functions, unless
# overridden by a more specific one
all:
# every imputation rule must have a `type` parameter, while some
# (like 'constant') have other required parameters (`value` here)
type: 'constant'
value: 0
# specifying `max` here will take precedence over the `all` rule for
# aggregations using a MAX() function
max:
type: 'mean'
# aggregates and categoricals define the actual features created, so
# at least one of them is required
#
# Aggregates of numerical columns. Each quantity is a number of some
# sort, and the list of metrics are applied to each quantity
aggregates:
-
quantity: 'homeless::INT'
# Imputation rules specified at the level of specific features
# will take precedence over the higher-level rules specified
# above. Note that the 'count' and 'sum' metrics will be
# imputed differently here.
imputation:
count:
type: 'mean'
sum:
type: 'constant'
value: 137
metrics:
- 'count'
- 'sum'
coltype: 'smallint' # Optional, if you want to control the column type in the generated features tables
-
# since we're specifying `aggregates_imputation` above,
# a feature-specific imputation rule can be omitted
quantity: 'some_flag'
metrics:
- 'max'
- 'sum'
# Categorical features. The column given can be of any type, but the
# choices must be compatible with that type for equality within SQL
# The result will be one feature for each choice/metric combination
categoricals:
-
column: 'color'
# note that we haven't specified a top level `categoricals_imputation`
# set of rules, so we have to include feature-specific imputation
# rules for both of our categoricals here.
imputation:
sum:
type: 'null_category'
max:
type: 'mean'
choices:
- 'red'
- 'blue'
- 'green'
metrics:
- 'sum'
-
column: 'shape'
# as with the top-level imputation rules, `all` can be used
# for the feature-level rules to specify the same type of
# imputation for all aggregation functions
imputation:
all:
type: 'zero'
choice_query: 'select distinct shape from cool_stuff'
metrics:
- 'sum'
# The time intervals over which to aggregate features
intervals:
- '1 year'
- '2 years'
- 'all'
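# With the configuration above, collate generates one feature column per
# (quantity or choice, metric, interval) combination. Column names roughly
# follow the pattern '{prefix}_{group}_{interval}_{column}_{metric}', e.g.
# something like 'prefix_entity_id_1year_homeless_sum' (exact naming may
# vary across triage versions).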
# FEATURE GROUPING
# define how to group features and generate combinations
# feature_group_definition allows you to create groups/subset of your features
# by different criteria.
# for instance,
# - 'tables' allows you to send a list of collate feature tables (collate builds these by appending '_aggregation_imputed' to the prefix)
# - 'prefix' allows you to specify a list of feature name prefixes
#
# This block is optional. If you don't specify it, it will default
# to using all of the 'prefix' values in the 'feature_aggregations' block.
feature_group_definition:
tables: ['prefix_aggregation_imputed']
# Strategies for generating combinations of groups
# available: all, leave-one-out, leave-one-in, all-combinations
# If not present, this will be set to 'all'
feature_group_strategies: ['all']
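# As an illustration, with feature groups A, B, and C:
#   'all' trains on (A, B, C)
#   'leave-one-out' trains on (B, C), (A, C), and (A, B)
#   'leave-one-in' trains on (A), (B), and (C)
#   'all-combinations' trains on every non-empty combination of A, B, and C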
# USER METADATA
# These are arbitrary keys/values that you can have Triage apply to the
# metadata for every matrix in the experiment. Any keys you include here can
# be used in the 'model_group_keys' below. For example, if you run an entity
# matching algorithm before running the experiment, you can include a `matchdatetime`
# in the `user_metadata` that signifies which set of `entity_id`s was used
# to create a given model/model group. That way, you don't accidentally match
# old predictions with new staged data without looking up the new `entity_id`s first.
user_metadata:
'matchdatetime': '2019-01-01'
# MODEL GROUPING (optional)
# Model groups partition trained models in a way that makes for easier analysis.
#
# model_group_keys defines a list of training matrix metadata and classifier keys that
# should be considered when creating a model group.
#
# There is an extensive default configuration, which is aimed at producing groups whose
# constituent models are equivalent to each other in all ways except for when they were trained.
# This makes the analysis of model stability easier.
#
# To accomplish this, the following default keys are used:
# 'class_path', 'parameters'
# 'feature_names', 'feature_groups', 'cohort_name', 'state'
# 'label_name', 'label_timespan', 'as_of_date_frequency', 'max_training_history'
#
# If you want to override this list, you can supply a 'model_group_keys' value.
# All of the defaults are available, along with some other temporal information
# that could be useful for more specialized analyses:
#
# 'first_as_of_time', 'last_as_of_time', 'matrix_info_end_time', 'as_of_times', 'feature_start_time'
#
# You can also use any pieces of user_metadata that you included in this experiment definition,
# as they will be present in the matrix metadata.
#
# model_group_keys: ['feature_groups', 'label_definition']
# BIAS AUDIT (optional)
# Every evaluation will include a bias audit (using the Aequitas toolkit).
# To run the bias audit it is necessary to define the protected groups by defining attributes (e.g. race) for every entity
# from_obj parameter: it can be a table name or a query (as with feature generation)
# The from_obj is expected to have the protected attributes in a single table with an entity_id column and a knowledge date column
# For a given as_of_date, Triage will use the most recent entry of the source table with a date earlier than that as_of_date as the value for those attributes
#
# thresholds:
# Running the bias audit might be slow, so the user should specify which thresholds should be used for the bias audit.
# Percentiles *should be between 0 and 100*, much like Triage model scoring config.
# People familiar with Aequitas may note that this differs from its expected values - Triage will do the conversion for Aequitas.
#
# ref groups:
# Please have a look at the Aequitas documentation for further information about the ref_groups_method
# https://dssg.github.io/aequitas/config.html
# By default this uses min_metric, meaning that for each bias metric the reference is the group with the minimum metric value (e.g. the race group with the lowest FPR)
# Alternatively it can be 'majority' (picks the largest group to serve as reference) or 'predefined' (needs a list of key values, see below)
bias_audit_config:
from_obj_table: 'semantic.demographics'
attribute_columns: ['race', 'sex','age']
knowledge_date_column: 'event_date'
entity_id_column: 'person_id'
ref_groups_method: 'predefined'
ref_groups:
'race': 'w'
'sex': 'm'
'age': '35-49'
thresholds:
percentiles: [1]
top_n: [100]
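# With the thresholds above, Aequitas metrics are computed once treating the
# top 1% of scores as the intervention group and once treating the top 100
# entities as the intervention group.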
# PREDICTION
# How predictions are computed for train and test matrices
#
# Rank tiebreaking - In the predictions.rank_abs and rank_pct columns, ties in the score
# are broken either at random or based on the 'worst' or 'best' options. 'worst' is the default.
#
# 'worst' will break ties with the ascending label value, so if you
# take the top 'k' predictions, and there are ties across the 'k'
# threshold, the predictions above the threshold will be negative
# labels if possible. NULL labels are also pushed to the top.
# 'best' will break ties with the descending label value, so if you
# take the top 'k' predictions, and there are ties across the 'k'
# threshold, the predictions above the threshold will be positive
# labels if possible. NULL labels are pushed to the bottom.
# 'random' will choose one random ordering to break ties. The result will be affected by
# current state of Postgres' random number generator. Before ranking, the generator is seeded
# based on the *model*'s random seed.
#prediction:
# rank_tiebreaker: "worst"
# MODEL SCORING
# How each trained model is scored
#
# Each entry in 'testing_metric_groups' needs a list of metrics chosen from those defined in
# catwalk.evaluation.ModelEvaluator.available_metrics (contributions welcome!)
# Depending on the metric, you may also need to specify thresholds or parameters.
#
# Parameters specify any hyperparameters needed. For most metrics,
# which are simply wrappers of sklearn functions, these
# are passed directly to sklearn.
#
# Thresholds are more specific: the score list is dichotomized and only the
# top percentile or top n entities are treated as predicted positive
#
# subsets, if passed, will add evaluations for subset(s) of the predictions to
# the subset_evaluations tables, using the same testing and training metric
# groups as used for overall evaluations but with any thresholds reapplied only
# to entities in the subset on the relevant as_of_dates. For example, when
# calculating precision@5_pct for the subset of women, the ModelEvaluator will
# count as positively labeled the top 5% of women, rather than any women in the
# top 5% overall. This is useful if, for example, different interventions will
# be applied to different subsets of entities (e.g., one program will provide
# subsidies to the top 500 women with children and another program will provide
# shelter to the top 150 women without children) and you would like to see
# whether a single model can be used for both applications. Subsets can also be
# used to see how a model's performance would be affected if the requirements
# for intervention eligibility became more restricted.
#
# Subsets should be a list of dictionaries with the following keys:
# - "name": a shorthand name for the subset
# - "query": a query that returns distinct entity_ids belonging to the
# subset on a given as_of_date with a placeholder for the
# as_of_date being queried
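# With the thresholds in the example below, each model's score list is
# dichotomized at the top 5% and 10% and at the top 5 and 10 entities,
# producing evaluations such as precision@5_pct and recall@10_abs.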
scoring:
testing_metric_groups:
-
metrics: [precision@, recall@]
thresholds:
percentiles: [5.0, 10.0]
top_n: [5, 10]
-
metrics: [f1]
-
metrics: [fbeta@]
parameters:
-
beta: 0.75
-
beta: 1.25
training_metric_groups:
-
metrics: [accuracy]
subsets:
-
name: women
query: |
select distinct entity_id
from demographics
where gender = 'woman'
and demographic_date < '{as_of_date}'::date
# INDIVIDUAL IMPORTANCES
# How feature importances for individuals should be computed
# There are two variables here:
# methods: Refer to *how to compute* individual importances.
# Each entry in this list should represent a different method.
# Available methods are in the catwalk library's:
# `catwalk.individual_importance.CALCULATE_STRATEGIES` list
# Will default to 'uniform', or just the global importances.
#
# n_ranks: The number of top features per individual to compute importances for
# Will default to 5
#
# This entire section can be left blank,
# in which case the defaults will be used.
individual_importance:
methods: [] # empty list means don't calculate individual importances
# methods: ['uniform']
n_ranks: 5
# MODEL GRID PRESETS
# Triage now comes with a set of predefined *recommended* grids
# named: quickstart, small, medium, large
# See the documentation for recommended use cases for those.
#
# If you set this configuration section you DO NOT NEED TO include the grid_config section.
# However, you MAY optionally include both sections if you want to add additional model types
# to the preset grid. Generally, this is most useful for including additional commonsense
# baselines that are specific to your project.
#
# model_grid_preset: 'small'
# GRID CONFIGURATION
# The classifier/hyperparameter combinations that should be trained
#
# Each top-level key should be a class name, importable from triage. sklearn is
# available, and if you have another classifier package you would like available,
# contribute it to requirement/main.txt
#
# Each lower-level key is a hyperparameter name for the given classifier, and
# each value is a list of potential values. All possible combinations of
# classifiers and hyperparameters are trained.
#
# NOTE: Triage now includes a new parameter named 'model_grid_preset' (see above)
# If you include both, triage will combine the unique model specifications across
# the preset and user-specified grids.
#
grid_config:
'sklearn.ensemble.ExtraTreesClassifier':
n_estimators: [100]
criterion: [gini, entropy]
max_depth: [1,5,10,20,50]
max_features: [sqrt,log2]
min_samples_split: [2,5,10]
# catwalk's custom model wrapper ScaledLogisticRegression will
# automatically scale the train data prior to fitting the model
# and then use the same scaling for the test data
'triage.component.catwalk.estimators.classifiers.ScaledLogisticRegression':
penalty: ['l1', 'l2']
solver: ['saga'] # 'saga' supports both l1 and l2 penalties; sklearn's default 'lbfgs' supports only l2
C: [0.01, 1]
# catwalk's PercentileRankOneFeature baseline will score each entity as
# as its percentile on the named feature. This is useful for comparing
# predictive models to simply ranking entities on a single feature.
'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature':
feature: ['feature_one', 'feature_two']
low_value_high_score: [True]
# catwalk's BaselineRankMultiFeature baseline will score based on the ranking
# by one or more features (note that the scores don't map to percentiles as
# in PercentileRankOneFeature). This provides a slightly more complex baseline
# than above, but still realistic for what might be encountered in practice.
# The example below will create two ranker "models": one ranking by two features
# and the other just by a single feature. Note that the rules are lists of
# dictionaries.
'triage.component.catwalk.baselines.rankers.BaselineRankMultiFeature':
rules:
- [{feature: 'feature_1', low_value_high_score: True}, {feature: 'feature_2', low_value_high_score: False}]
- [{feature: 'feature_3', low_value_high_score: True}]
# catwalk's SimpleThresholder baseline will evaluate each entity against
# a list of rules and classify entities as 1 based on whether they meet
# any or all of these rules, depending on whether 'or' or 'and' is
# passed as the logical_operator. This is useful for comparing
# predictive modeling against simple rule-based classification.
'triage.component.catwalk.baselines.thresholders.SimpleThresholder':
rules: [['feature_one > 3', 'feature_two <= 5']]
logical_operator: 'and'