generals:
  warn: False # show warnings
  save_folder: exps/new_exp # saving directory
  savepsds: True # save psds data
  test_on_public_eval: False # use DESED Real public_eval as the test dataset (default is DESED Real Validation)
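  # Illustrative usage (an assumption about how the flags combine, not taken from this file):
  # to score a model already saved in save_folder on the public evaluation set, you would
  # presumably set
  #   test_on_public_eval: True   # together with training.test_only: True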
training:
  test_only: False # perform test without training, using the model saved in save_folder
  debug: False # debugging mode runs train/validation for only 1 epoch and automatically sets div_dataset to True
  div_dataset: False # divide the datasets by div_ratio; for debugging, to run through train/valid/test faster
  div_ratio: 25
  seed: 21
  weak_split: 0.9 # split the weak dataset so that "weak_split" of it is used for training and the rest for validation
  n_epochs: 200 # number of epochs to run
  n_epochs_warmup: 50 # number of epochs used for exponential warmup
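  # The warmup schedule itself lives in the training code; a common exponential ramp-up
  # (an assumption here, not confirmed by this file) is
  #   w(epoch) = w_cons_max * exp(-5 * (1 - epoch / n_epochs_warmup)^2)
  # so the consistency weight reaches w_cons_max after n_epochs_warmup epochs.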
  val_thresholds: [0.5] # thresholds used to compute f1 intersection in validation.
  n_test_thresholds: 50 # number of thresholds used to compute psds in test
  ema_factor: 0.999 # ema factor for teacher model used in mean teacher model
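  # For reference, the usual mean-teacher update that this factor parameterizes is
  #   teacher_param = ema_factor * teacher_param + (1 - ema_factor) * student_param
  # i.e. with 0.999 the teacher tracks the student through a long exponential moving average.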
  w_weak: 0.5 # weight for weak classification cost
  w_cons_max: 2 # max weight used for consistency loss
  w_weak_cons: 1 # max weight for weak consistency loss
  decode_weak_valid: 0 # how weak predictions are used to mask strong predictions in validation,
  decode_weak_test: 1 # and in test. 0: no weak prediction used, 1: weak prediction masking, 2: weak SED
  trainweak_only: False # train the model without the strong dataset
  afl_loss: # asymmetric focal loss. leave empty (None) to use BCE loss; to use AFL, set [gamma, zeta]
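  # Hypothetical example (values not taken from this repo): enabling AFL with gamma=2 and
  # zeta=1 would look like
  #   afl_loss: [ 2, 1 ]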
  median_window: [5,11,5,5,5,67,61,49,5,17] # length of median filter used to smooth prediction in inference
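  # Note: the 10 entries correspond to the 10 target classes (CRNN.n_class below),
  # giving one median-filter length (in output frames) per class.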
  # data augmentations
  mixup_rate: 1 # rate at which mixup is applied on the training data
  mixup_type: soft # soft mixup assigns the mixing ratio to the labels, hard mixup gives a 1 to every label present.
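  # Illustration (made-up numbers): mixing two clips with ratio 0.7/0.3, soft mixup labels
  # the mixed classes 0.7 and 0.3 respectively, while hard mixup labels both classes 1.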
  time_mask_ratios: [ 5, 20 ] # ratio of time masking w.r.t. total time length; 1/20 to 1/5 of the time frames will be masked
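  # Rough numbers (derived from the feature settings below): a 10 s clip at 16 kHz with
  # hop_length 256 gives about 625 spectrogram frames, so roughly 31 to 125 frames are masked.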
  transform: # hyperparameters for data augmentations that do not alter the label information.
    n_transform: 2 # 0: no augmentation below is applied. 1: the same augmentation below is applied to the student/teacher inputs. 2: different augmentations below are applied to the student/teacher inputs.
    choice: [ 1, 0, 0 ] # apply the chosen data augmentations: [ FilterAugment, freq_mask, add_noise ]
    filter_db_range: [ -4.5, 6 ] # dB range of FilterAugment to be applied on each band
    filter_bands: [ 2, 5 ] # range of the number of frequency bands in FilterAugment
    filter_minimum_bandwidth: 4
    filter_type: step
    freq_mask_ratio: 16 # maximum ratio of the frequency masking range; at most 1/16 of the frequency bins will be masked
    noise_snrs: [ 35, 40 ] # SNR of the original signal w.r.t. the added noise
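    # For example, with n_mels: 128 below, freq_mask_ratio: 16 allows at most 128 / 16 = 8
    # mel bins to be masked, and added noise keeps the signal 35-40 dB above the noise floor.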
feature:
  n_mels: 128
  frame_length: 2048
  hop_length: 256
  n_window: 2048
  sample_rate: 16000
  f_min: 0
  f_max: 8000
  audio_max_len: 10
  sr: 16000
  net_subsample: 4
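# Back-of-the-envelope shape check (illustrative, exact counts depend on padding):
# audio_max_len 10 s at sample_rate 16000 is 160000 samples; with hop_length 256 that is
# about 625 spectrogram frames of n_mels 128, and net_subsample 4 reduces the output to
# roughly 156 frames, i.e. about 64 ms per output frame.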
CRNN:
  n_class: 10
  n_RNN_cell: 256
  n_RNN_layer: 2
  rec_dropout: 0
  attention: class # time / class
  n_input_ch: 1
  activation: cg
  conv_dropout: 0.5
  kernel: [ 3, 3, 3, 3, 3, 3, 3 ]
  pad: [ 1, 1, 1, 1, 1, 1, 1 ]
  stride: [ 1, 1, 1, 1, 1, 1, 1 ]
  n_filt: [ 32, 64, 128, 256, 256, 256, 256 ]
  pooling: [ [ 2, 2 ], [ 2, 2 ], [ 1, 2 ], [ 1, 2 ], [ 1, 2 ], [ 1, 2 ], [ 1, 2 ] ]
  n_basis_kernels: 4
  DY_layers: [ 0, 1, 1, 1, 1, 1, 1 ]
  temperature: 31
  pool_dim: time # "freq" pools frequency (leaves time), "time" pools time, "both" pools both
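# Quick consistency check (assuming each pooling pair above is [time, freq]): the seven
# pooling stages reduce time by 2*2*1*1*1*1*1 = 4, matching feature.net_subsample: 4, and
# frequency by 2^7 = 128, so the 128 mel bins collapse to a single frequency bin per frame.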
dataset: # change with your paths if different.
  # NOTE: if you only have the 44.1 kHz data, the *_16k folders below are the paths where
  # the resampled data will be placed.
  weak_folder: "../datasets/dcase2021/dataset/audio/train/weak_16k/"
  weak_folder_44k: "../datasets/dcase2021/dataset/audio/train/weak/"
  weak_tsv: "../datasets/dcase2021/dataset/metadata/train/weak.tsv"
  unlabeled_folder: "../datasets/dcase2021/dataset/audio/train/unlabel_in_domain_16k/"
  unlabeled_folder_44k: "../datasets/dcase2021/dataset/audio/train/unlabel_in_domain/"
  test_folder: "../datasets/dcase2021/dataset/audio/validation/validation_16k/"
  test_folder_44k: "../datasets/dcase2021/dataset/audio/validation/validation/"
  test_tsv: "../datasets/dcase2021/dataset/metadata/validation/validation.tsv"
  test_dur: "../datasets/dcase2021/dataset/metadata/validation/validation_durations.tsv"
  pubeval_folder: "../datasets/dcase2021/dataset/audio/eval/public_16k/"
  pubeval_folder_44k: "../datasets/dcase2021/dataset/audio/eval/public/"
  pubeval_tsv: "../datasets/dcase2021/dataset/metadata/eval/public.tsv"
  pubeval_dur: "../datasets/dcase2021/dataset/metadata/eval/public_durations.tsv"
synth_dataset: # change with your paths if different.
  synth_train_folder: "../datasets/dcase2021/dataset/audio/train/synthetic21_train/soundscapes_16k/"
  synth_train_folder_44k: "../datasets/dcase2021/dataset/audio/train/synthetic21_train/soundscapes/"
  synth_train_tsv: "../datasets/dcase2021/dataset/metadata/train/synthetic21_train/soundscapes.tsv"
  synth_train_dur: "../datasets/dcase2021/dataset/metadata/train/synthetic21_train/durations.tsv"
  synth_val_folder: "../datasets/dcase2021/dataset/audio/validation/synthetic21_validation/soundscapes_16k/"
  synth_val_folder_44k: "../datasets/dcase2021/dataset/audio/validation/synthetic21_validation/soundscapes/"
  synth_val_tsv: "../datasets/dcase2021/dataset/metadata/validation/synthetic21_validation/soundscapes.tsv"
  synth_val_dur: "../datasets/dcase2021/dataset/metadata/validation/synthetic21_validation/durations.tsv"
scaler:
  statistic: instance # instance or dataset-wide statistic
  normtype: minmax # minmax or standard or mean normalization
  dims: [ 0, 2 ] # dimensions over which normalization is applied
  savepath: ./scaler.ckpt # path to scaler checkpoint
opt:
  lr: 0.001