generals:
  warn: False # show warnings
  save_folder: exps/new_exp # saving directory
  savepsds: True # save psds data
  test_on_public_eval: False # use DESED Real public_eval as the test dataset (default is DESED Real Validation)
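  # Illustrative usage (an assumption about how the flags combine, not taken from this file):
  # to score a model already saved in save_folder on the public evaluation set, you would
  # presumably set
  #   test_on_public_eval: True   # together with training.test_only: True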
training:
  test_only: False # perform test without training, using the model saved in save_folder
  debug: False # debugging mode runs train/validation for only 1 epoch and automatically sets div_dataset to True
  div_dataset: False # divide the datasets by div_ratio; for debugging, to run through train/valid/test faster
  div_ratio: 25
  seed: 21
  weak_split: 0.9 # split the weak dataset so that "weak_split" of it is used for training and the rest for validation
  n_epochs: 200 # number of epochs to run
  n_epochs_warmup: 50 # number of epochs used for exponential warmup
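  # The warmup schedule itself lives in the training code; a common exponential ramp-up
  # (an assumption here, not confirmed by this file) is
  #   w(epoch) = w_cons_max * exp(-5 * (1 - epoch / n_epochs_warmup)^2)
  # so the consistency weight reaches w_cons_max after n_epochs_warmup epochs.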
  val_thresholds: [0.5] # thresholds used to compute f1 intersection in validation.
  n_test_thresholds: 50 # number of thresholds used to compute psds in test
  ema_factor: 0.999 # ema factor for teacher model used in mean teacher model
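  # For reference, the usual mean-teacher update that this factor parameterizes is
  #   teacher_param = ema_factor * teacher_param + (1 - ema_factor) * student_param
  # i.e. with 0.999 the teacher tracks the student through a long exponential moving average.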
  w_weak: 0.5 # weight for weak classification cost
  w_cons_max: 2 # max weight used for consistency loss
  w_weak_cons: 1 # max weight for weak consistency loss
  decode_weak_valid: 0 # how weak predictions are used to mask strong predictions in validation,
  decode_weak_test: 1 # and in test. 0: no weak prediction used, 1: weak prediction masking, 2: weak SED
  trainweak_only: False # train the model without the strong dataset
  afl_loss: # asymmetric focal loss. leave empty (None) to use BCE loss; to use AFL, set [gamma, zeta]
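  # Hypothetical example (values not taken from this repo): enabling AFL with gamma=2 and
  # zeta=1 would look like
  #   afl_loss: [ 2, 1 ]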
  median_window: [5,11,5,5,5,67,61,49,5,17] # length of median filter used to smooth prediction in inference
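  # Note: the 10 entries correspond to the 10 target classes (CRNN.n_class below),
  # giving one median-filter length (in output frames) per class.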
  # data augmentations
  mixup_rate: 1 # rate at which mixup is applied on the training data
  mixup_type: soft # soft mixup assigns the mixing ratio to the labels, hard mixup gives a 1 to every label present.
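  # Illustration (made-up numbers): mixing two clips with ratio 0.7/0.3, soft mixup labels
  # the mixed classes 0.7 and 0.3 respectively, while hard mixup labels both classes 1.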
  time_mask_ratios: [ 5, 20 ] # ratio of time masking w.r.t. total time length; 1/20 to 1/5 of the time frames will be masked
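  # Rough numbers (derived from the feature settings below): a 10 s clip at 16 kHz with
  # hop_length 256 gives about 625 spectrogram frames, so roughly 31 to 125 frames are masked.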
  transform: # hyperparameters for data augmentations that do not alter the label information.
    n_transform: 2 # 0: no augmentation below is applied. 1: the same augmentation below is applied to the student/teacher inputs. 2: different augmentations below are applied to the student/teacher inputs.
    choice: [ 1, 0, 0 ] # apply the chosen data augmentations: [ FilterAugment, freq_mask, add_noise ]
    filter_db_range: [ -4.5, 6 ] # dB range of FilterAugment to be applied on each band
    filter_bands: [ 2, 5 ] # range of the number of frequency bands in FilterAugment
    filter_minimum_bandwidth: 4
    filter_type: step
    freq_mask_ratio: 16 # maximum ratio of the frequency masking range; at most 1/16 of the frequency bins will be masked
    noise_snrs: [ 35, 40 ] # SNR of the original signal w.r.t. the added noise
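    # For example, with n_mels: 128 below, freq_mask_ratio: 16 allows at most 128 / 16 = 8
    # mel bins to be masked, and added noise keeps the signal 35-40 dB above the noise floor.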
feature:
  n_mels: 128
  frame_length: 2048
  hop_length: 256
  n_window: 2048
  sample_rate: 16000
  f_min: 0
  f_max: 8000
  audio_max_len: 10
  sr: 16000
  net_subsample: 4
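# Back-of-the-envelope shape check (illustrative, exact counts depend on padding):
# audio_max_len 10 s at sample_rate 16000 is 160000 samples; with hop_length 256 that is
# about 625 spectrogram frames of n_mels 128, and net_subsample 4 reduces the output to
# roughly 156 frames, i.e. about 64 ms per output frame.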
CRNN:
  n_class: 10
  n_RNN_cell: 256
  n_RNN_layer: 2
  rec_dropout: 0
  attention: class # time / class
  n_input_ch: 1
  activation: cg
  conv_dropout: 0.5
  kernel: [ 3, 3, 3, 3, 3, 3, 3 ]
  pad: [ 1, 1, 1, 1, 1, 1, 1 ]
  stride: [ 1, 1, 1, 1, 1, 1, 1 ]
  n_filt: [ 32, 64, 128, 256, 256, 256, 256 ]
  pooling: [ [ 2, 2 ], [ 2, 2 ], [ 1, 2 ], [ 1, 2 ], [ 1, 2 ], [ 1, 2 ], [ 1, 2 ] ]
  n_basis_kernels: 4
  DY_layers: [ 0, 1, 1, 1, 1, 1, 1 ]
  temperature: 31
  pool_dim: time # "freq" pools frequency (leaves time), "time" pools time, "both" pools both
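# Quick consistency check (assuming each pooling pair above is [time, freq]): the seven
# pooling stages reduce time by 2*2*1*1*1*1*1 = 4, matching feature.net_subsample: 4, and
# frequency by 2^7 = 128, so the 128 mel bins collapse to a single frequency bin per frame.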
dataset: # change with your paths if different.
  # NOTE: if you only have the 44.1 kHz data, the *_16k folders below are the paths where
  # the resampled data will be placed.
  weak_folder: "../datasets/dcase2021/dataset/audio/train/weak_16k/"
  weak_folder_44k: "../datasets/dcase2021/dataset/audio/train/weak/"
  weak_tsv: "../datasets/dcase2021/dataset/metadata/train/weak.tsv"
  unlabeled_folder: "../datasets/dcase2021/dataset/audio/train/unlabel_in_domain_16k/"
  unlabeled_folder_44k: "../datasets/dcase2021/dataset/audio/train/unlabel_in_domain/"
  test_folder: "../datasets/dcase2021/dataset/audio/validation/validation_16k/"
  test_folder_44k: "../datasets/dcase2021/dataset/audio/validation/validation/"
  test_tsv: "../datasets/dcase2021/dataset/metadata/validation/validation.tsv"
  test_dur: "../datasets/dcase2021/dataset/metadata/validation/validation_durations.tsv"
  pubeval_folder: "../datasets/dcase2021/dataset/audio/eval/public_16k/"
  pubeval_folder_44k: "../datasets/dcase2021/dataset/audio/eval/public/"
  pubeval_tsv: "../datasets/dcase2021/dataset/metadata/eval/public.tsv"
  pubeval_dur: "../datasets/dcase2021/dataset/metadata/eval/public_durations.tsv"
synth_dataset: # change with your paths if different.
  synth_train_folder: "../datasets/dcase2021/dataset/audio/train/synthetic21_train/soundscapes_16k/"
  synth_train_folder_44k: "../datasets/dcase2021/dataset/audio/train/synthetic21_train/soundscapes/"
  synth_train_tsv: "../datasets/dcase2021/dataset/metadata/train/synthetic21_train/soundscapes.tsv"
  synth_train_dur: "../datasets/dcase2021/dataset/metadata/train/synthetic21_train/durations.tsv"
  synth_val_folder: "../datasets/dcase2021/dataset/audio/validation/synthetic21_validation/soundscapes_16k/"
  synth_val_folder_44k: "../datasets/dcase2021/dataset/audio/validation/synthetic21_validation/soundscapes/"
  synth_val_tsv: "../datasets/dcase2021/dataset/metadata/validation/synthetic21_validation/soundscapes.tsv"
  synth_val_dur: "../datasets/dcase2021/dataset/metadata/validation/synthetic21_validation/durations.tsv"
scaler:
  statistic: instance # instance or dataset-wide statistic
  normtype: minmax # minmax or standard or mean normalization
  dims: [ 0, 2 ] # dimensions over which normalization is applied
  savepath: ./scaler.ckpt # path to scaler checkpoint
opt:
  lr: 0.001