# config.py (forked from lvapeab/nmt-keras)
def load_parameters():
"""
Loads the defined hyperparameters.
:return parameters: Dictionary of loaded parameters.
"""
# Input data params
TASK_NAME = 'EuTrans' # Task name.
DATASET_NAME = TASK_NAME # Dataset name.
SRC_LAN = 'es' # Language of the source text.
TRG_LAN = 'en' # Language of the target text.
DATA_ROOT_PATH = 'examples/%s/' % DATASET_NAME # Path where data is stored.
# SRC_LAN or TRG_LAN will be added to the file names.
TEXT_FILES = {'train': 'training.', # Data files.
'val': 'dev.',
'test': 'test.'}
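# Illustrative note (not in the original config): the language code is appended to each
# file prefix under DATA_ROOT_PATH, so with the defaults above the training pair would
# presumably be 'examples/EuTrans/training.es' (source) and 'examples/EuTrans/training.en' (target).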
GLOSSARY = None # Glossary location. If not None, it overwrites translations according to this glossary file
# Dataset class parameters
INPUTS_IDS_DATASET = ['source_text', 'state_below'] # Corresponding inputs of the dataset.
OUTPUTS_IDS_DATASET = ['target_text'] # Corresponding outputs of the dataset.
INPUTS_IDS_MODEL = ['source_text', 'state_below'] # Corresponding inputs of the built model.
OUTPUTS_IDS_MODEL = ['target_text'] # Corresponding outputs of the built model.
INPUTS_TYPES_DATASET = ['text-features', 'text-features'] # Corresponding types of the data. 'text' or 'text-features' allowed.
OUTPUTS_TYPES_DATASET = ['text-features'] # The two types are equivalent; they only differ in how the data is loaded.
# Evaluation params
METRICS = ['coco'] # Metric used for evaluating the model.
EVAL_ON_SETS = ['val'] # Possible values: 'train', 'val' and 'test' (external evaluator).
EVAL_ON_SETS_KERAS = [] # Possible values: 'train', 'val' and 'test' (Keras' evaluator). Untested.
START_EVAL_ON_EPOCH = 1 # First epoch to start the model evaluation.
EVAL_EACH_EPOCHS = True # Select whether to evaluate every N epochs or every N updates.
EVAL_EACH = 1 # Sets the evaluation frequency (epochs or updates).
# Search parameters
SAMPLING = 'max_likelihood' # Possible values: multinomial or max_likelihood (recommended).
TEMPERATURE = 1 # Multinomial sampling parameter.
BEAM_SEARCH = True # Switches on-off the beam search procedure.
BEAM_SIZE = 6 # Beam size (in case of BEAM_SEARCH == True).
OPTIMIZED_SEARCH = True # Compute annotations only a single time per sample.
SEARCH_PRUNING = False # Apply pruning strategies to the beam search method.
# It will likely increase decoding speed, but decrease quality.
MAXLEN_GIVEN_X = True # Generate translations of similar length to the source sentences.
MAXLEN_GIVEN_X_FACTOR = 2 # The hypotheses will have (at most) the number of words of the
# source sentence * MAXLEN_GIVEN_X_FACTOR.
MINLEN_GIVEN_X = True # Generate translations of similar length to the source sentences.
MINLEN_GIVEN_X_FACTOR = 3 # The hypotheses will have (at least) the number of words of the
# source sentence / MINLEN_GIVEN_X_FACTOR.
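# Worked example (illustrative only): for a 10-word source sentence and the default
# factors above, hypotheses are capped at 10 * 2 = 20 words and must contain at least
# 10 // 3 = 3 words (assuming integer division).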
# Apply length and coverage decoding normalizations.
# See Section 7 from Wu et al. (2016) (https://arxiv.org/abs/1609.08144).
LENGTH_PENALTY = False # Apply length penalty.
LENGTH_NORM_FACTOR = 0.2 # Length penalty factor.
COVERAGE_PENALTY = False # Apply source coverage penalty.
COVERAGE_NORM_FACTOR = 0.2 # Coverage penalty factor.
# Alternative (simple) length normalization.
NORMALIZE_SAMPLING = False # Normalize hypothesis scores according to their length.
ALPHA_FACTOR = .6 # Normalization according to |h|**ALPHA_FACTOR.
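# Minimal sketch (not from the original code) of how the two normalizations above are
# typically applied to a hypothesis log-probability `score` with length `hyp_len`
# (both names are illustrative):
#   Simple normalization (NORMALIZE_SAMPLING): score / (hyp_len ** ALPHA_FACTOR)
#   Length penalty of Wu et al. (2016), Eq. 14 (LENGTH_PENALTY):
#       lp = ((5.0 + hyp_len) / 6.0) ** LENGTH_NORM_FACTOR
#       normalized_score = score / lp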
# Sampling params: Show some samples during training.
SAMPLE_ON_SETS = ['train', 'val'] # Possible values: 'train', 'val' and 'test'.
N_SAMPLES = 5 # Number of samples generated.
START_SAMPLING_ON_EPOCH = 1 # First epoch at which to start the sampling counter.
SAMPLE_EACH_UPDATES = 300 # Sampling frequency (always in #updates).
# Unknown words treatment
POS_UNK = True # Enable POS_UNK strategy for unknown words.
HEURISTIC = 0 # Heuristic to follow:
# 0: Replace the UNK by the correspondingly aligned source.
# 1: Replace the UNK by the translation (given by an external
# dictionary) of the correspondingly aligned source.
# 2: Replace the UNK by the translation (given by an external
# dictionary) of the correspondingly aligned source only if it
# starts with a lowercase letter. Otherwise, the source word is copied.
ALIGN_FROM_RAW = True # Align using the full vocabulary or the short_list.
# Source -- Target pkl mapping (used for heuristics 1--2). See utils/build_mapping_file.sh for further info.
MAPPING = DATA_ROOT_PATH + '/mapping.%s_%s.pkl' % (SRC_LAN, TRG_LAN)
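# Illustrative note: with the defaults above, MAPPING resolves to
# 'examples/EuTrans//mapping.es_en.pkl' (DATA_ROOT_PATH already ends in '/', so the
# path contains a double slash, which POSIX paths tolerate).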
# Word representation params
TOKENIZATION_METHOD = 'tokenize_none' # Select which tokenization we'll apply.
# See the Dataset class (from staged_keras_wrapper) for more info.
BPE_CODES_PATH = DATA_ROOT_PATH + '/training_codes.joint' # If TOKENIZATION_METHOD = 'tokenize_bpe',
# sets the path to the learned BPE codes.
DETOKENIZATION_METHOD = 'detokenize_none' # Select which de-tokenization method we'll apply.
APPLY_DETOKENIZATION = False # Whether we apply a detokenization method.
TOKENIZE_HYPOTHESES = True # Whether we tokenize the hypotheses using the
# previously defined tokenization method.
TOKENIZE_REFERENCES = True # Whether we tokenize the references using the
# previously defined tokenization method.
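# Illustrative example (an assumption, not taken from the original file): to work on
# subword units, one would typically switch to something like
#   TOKENIZATION_METHOD = 'tokenize_bpe'
#   DETOKENIZATION_METHOD = 'detokenize_bpe'  # assuming the wrapper offers this method
#   APPLY_DETOKENIZATION = True
# after learning the BPE codes referenced by BPE_CODES_PATH.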
# Input image parameters
DATA_AUGMENTATION = False # Apply data augmentation on input data (still unimplemented for text inputs).
# Text parameters
FILL = 'end' # Whether we pad the 'end' or the 'start' of the sentence with 0s. We can also 'center' it.
PAD_ON_BATCH = True # Whether we take as many timesteps as the longest sequence of
# the batch or a fixed size (MAX_OUTPUT_TEXT_LEN).
# Input text parameters
INPUT_VOCABULARY_SIZE = 0 # Size of the input vocabulary. Set to 0 for using all,
# otherwise it will be truncated to these most frequent words.
MIN_OCCURRENCES_INPUT_VOCAB = 0 # Minimum number of occurrences allowed for the words in the input vocabulary.
# Set to 0 for using them all.
MAX_INPUT_TEXT_LEN = 50 # Maximum length of the input sequence.
# Output text parameters
OUTPUT_VOCABULARY_SIZE = 0 # Size of the output vocabulary. Set to 0 for using all,
# otherwise it will be truncated to these most frequent words.
MIN_OCCURRENCES_OUTPUT_VOCAB = 0 # Minimum number of occurrences allowed for the words in the output vocabulary.
MAX_OUTPUT_TEXT_LEN = 50 # Maximum length of the output sequence.
# set to 0 if we want to use the whole answer as a single class.
MAX_OUTPUT_TEXT_LEN_TEST = MAX_OUTPUT_TEXT_LEN * 3 # Maximum length of the output sequence during test time.
# Optimizer parameters (see model.compile() function).
LOSS = 'categorical_crossentropy'
CLASSIFIER_ACTIVATION = 'softmax'
SAMPLE_WEIGHTS = True # Select whether we use a weights matrix (mask) for the data outputs
LABEL_SMOOTHING = 0. # Epsilon value for label smoothing. Only valid for 'categorical_crossentropy' loss. See arxiv.org/abs/1512.00567.
OPTIMIZER = 'Adam' # Optimizer. Supported optimizers: SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam.
LR = 0.001 # Learning rate. Recommended values - Adam 0.0002 - Adadelta 1.0.
CLIP_C = 5. # During training, clip L2 norm of gradients to this value (0. means deactivated).
CLIP_V = 0. # During training, clip absolute value of gradients to this value (0. means deactivated).
USE_TF_OPTIMIZER = True # Use the native TensorFlow optimizer (only for the TensorFlow backend).
# Advanced parameters for optimizers. Default values are usually effective.
MOMENTUM = 0. # Momentum value (for SGD optimizer).
NESTEROV_MOMENTUM = False # Use Nesterov momentum (for SGD optimizer).
RHO = 0.9 # Rho value (for Adadelta and RMSprop optimizers).
BETA_1 = 0.9 # Beta 1 value (for the Adam, Adamax and Nadam optimizers).
BETA_2 = 0.999 # Beta 2 value (for the Adam, Adamax and Nadam optimizers).
AMSGRAD = False # Whether to apply the AMSGrad variant of Adam (see https://openreview.net/pdf?id=ryQu7f-RZ).
EPSILON = 1e-7 # Epsilon value for the optimizers.
ACCUMULATE_GRADIENTS = 1 # Accumulate gradients for this number of batches. Currently only implemented for Adam.
# Learning rate schedule
LR_DECAY = None # Frequency (number of epochs or updates) between LR annealings. Set to None to not decay the learning rate.
LR_GAMMA = 0.8 # Multiplier used for decreasing the LR.
LR_REDUCE_EACH_EPOCHS = False # Whether LR_DECAY is counted in epochs (True) or updates (False).
LR_START_REDUCTION_ON_EPOCH = 0 # Epoch to start the reduction.
LR_REDUCER_TYPE = 'exponential' # Reduction function. 'linear', 'exponential' and 'noam' implemented.
# Linear reduction: new_lr = lr * LR_GAMMA
# Exponential reduction: new_lr = lr * LR_REDUCER_EXP_BASE ** (current_nb / LR_HALF_LIFE) * LR_GAMMA
# Noam reduction: new_lr = lr * min(current_nb ** LR_REDUCER_EXP_BASE, current_nb * LR_HALF_LIFE ** WARMUP_EXP)
LR_REDUCER_EXP_BASE = -0.5 # Base for the exponential decay.
LR_HALF_LIFE = 100 # Factor / warmup steps for the exponential / noam decay.
WARMUP_EXP = -1.5 # Warmup steps for noam decay.
MIN_LR = 1e-9 # Minimum value allowed for the decayed LR
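# Worked example (illustrative only) of the 'noam' rule documented above, using the
# defaults LR_REDUCER_EXP_BASE = -0.5, LR_HALF_LIFE = 100 and WARMUP_EXP = -1.5:
#   at update 100, min(100 ** -0.5, 100 * 100 ** -1.5) = min(0.1, 0.1) = 0.1, so
#   new_lr = LR * 0.1; before update 100 the second (linear warmup) term is the smaller one.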
# Training parameters
MAX_EPOCH = 500 # Stop after training for this number of epochs.
BATCH_SIZE = 50 # Size of each minibatch.
N_GPUS = 1 # Number of GPUs to use. Only for Tensorflow backend. Each GPU will receive mini-batches of BATCH_SIZE / N_GPUS.
HOMOGENEOUS_BATCHES = False # Use batches with homogeneous output lengths (Dangerous!!).
JOINT_BATCHES = 4 # When using homogeneous batches, get this number of batches to sort.
PARALLEL_LOADERS = 1 # Parallel data batch loaders. Somewhat untested if > 1.
EPOCHS_FOR_SAVE = 1 # Number of epochs between model saves.
WRITE_VALID_SAMPLES = True # Write valid samples in file.
SAVE_EACH_EVALUATION = True # Save each time we evaluate the model.
# Early stop parameters
EARLY_STOP = True # Turns on/off the early stop protocol.
PATIENCE = 10 # We'll stop if the val STOP_METRIC does not improve after this
# number of evaluations.
STOP_METRIC = 'Bleu_4' # Metric used for the early stop.
# Model parameters
MODEL_TYPE = 'AttentionRNNEncoderDecoder' # Model to train. See model_zoo.py for more info.
# Supported architectures: 'AttentionRNNEncoderDecoder' and 'Transformer'.
# Hyperparameters common to all models
# # # # # # # # # # # # # # # # # # # # # # # #
TRAINABLE_ENCODER = True # Whether the encoder's weights should be modified during training.
TRAINABLE_DECODER = True # Whether the decoder's weights should be modified during training.
# Initializers (see keras/initializations.py).
INIT_FUNCTION = 'glorot_uniform' # General initialization function for matrices.
INNER_INIT = 'orthogonal' # Initialization function for inner RNN matrices.
INIT_ATT = 'glorot_uniform' # Initialization function for attention mechanism matrices.
SOURCE_TEXT_EMBEDDING_SIZE = 32 # Source language word embedding size.
SRC_PRETRAINED_VECTORS = None # Path to pretrained vectors (e.g.: DATA_ROOT_PATH + '/DATA/word2vec.%s.npy' % SRC_LAN).
# Set to None if you don't want to use pretrained vectors.
# When using pretrained word embeddings, this parameter must match the word embedding size.
SRC_PRETRAINED_VECTORS_TRAINABLE = True # Whether to fine-tune the source word embedding vectors.
TARGET_TEXT_EMBEDDING_SIZE = 32 # Target language word embedding size.
TRG_PRETRAINED_VECTORS = None # Path to pretrained vectors. (e.g. DATA_ROOT_PATH + '/DATA/word2vec.%s.npy' % TRG_LAN)
# Set to None if you don't want to use pretrained vectors.
# When using pretrained word embeddings, the size of the pretrained word embeddings must match with the word embeddings size.
TRG_PRETRAINED_VECTORS_TRAINABLE = True # Whether to fine-tune the target word embedding vectors.
SCALE_SOURCE_WORD_EMBEDDINGS = False # Scale source word embeddings by Sqrt(SOURCE_TEXT_EMBEDDING_SIZE).
SCALE_TARGET_WORD_EMBEDDINGS = False # Scale target word embeddings by Sqrt(TARGET_TEXT_EMBEDDING_SIZE).
TIE_EMBEDDINGS = False # Use the same embeddings for source and target language.
N_LAYERS_ENCODER = 1 # Stack this number of encoding layers.
N_LAYERS_DECODER = 1 # Stack this number of decoding layers.
# Additional Fully-Connected layers applied before softmax.
# Here we should specify the activation function and the output dimension.
# (e.g DEEP_OUTPUT_LAYERS = [('tanh', 600), ('relu', 400), ('relu', 200)])
DEEP_OUTPUT_LAYERS = [('linear', TARGET_TEXT_EMBEDDING_SIZE)]
# # # # # # # # # # # # # # # # # # # # # # # #
# AttentionRNNEncoderDecoder model hyperparameters
# # # # # # # # # # # # # # # # # # # # # # # #
ENCODER_RNN_TYPE = 'LSTM' # Encoder's RNN unit type ('LSTM' and 'GRU' supported).
USE_CUDNN = True # Use CuDNN's implementation of GRU and LSTM (only for Tensorflow backend).
DECODER_RNN_TYPE = 'ConditionalLSTM' # Decoder's RNN unit type.
# ('LSTM', 'GRU', 'ConditionalLSTM' and 'ConditionalGRU' supported).
ATTENTION_MODE = 'add' # Attention mode. 'add' (Bahdanau-style), 'dot' (Luong-style) or 'scaled-dot'.
# Encoder configuration
ENCODER_HIDDEN_SIZE = 32 # For models with RNN encoder.
BIDIRECTIONAL_ENCODER = True # Use bidirectional encoder.
BIDIRECTIONAL_DEEP_ENCODER = True # Use bidirectional encoder in all encoding layers.
BIDIRECTIONAL_MERGE_MODE = 'concat' # Merge function for bidirectional layers.
# Fully-Connected layers for initializing the first decoder RNN state.
# Here we should only specify the activation function of each layer (as they have a potentially fixed size)
# (e.g INIT_LAYERS = ['tanh', 'relu'])
INIT_LAYERS = ['tanh']
# Decoder configuration
DECODER_HIDDEN_SIZE = 32 # For models with RNN decoder.
ATTENTION_SIZE = DECODER_HIDDEN_SIZE
# Skip connections parameters
SKIP_VECTORS_HIDDEN_SIZE = TARGET_TEXT_EMBEDDING_SIZE # Hidden size.
ADDITIONAL_OUTPUT_MERGE_MODE = 'Add' # Merge mode for the skip-connections (see keras.layers.merge.py).
SKIP_VECTORS_SHARED_ACTIVATION = 'tanh' # Activation for the skip vectors.
# # # # # # # # # # # # # # # # # # # # # # # #
# Transformer model hyperparameters
# # # # # # # # # # # # # # # # # # # # # # # #
MODEL_SIZE = 32 # Transformer model size (d_{model} in the paper).
MULTIHEAD_ATTENTION_ACTIVATION = 'linear' # Activation of the input projections in the Multi-Head Attention blocks.
FF_SIZE = MODEL_SIZE * 4 # Size of the feed-forward layers of the Transformer model.
N_HEADS = 8 # Number of parallel attention layers of the Transformer model.
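# Illustrative note (assuming the usual even split of MODEL_SIZE across heads): with the
# toy defaults above, FF_SIZE = 32 * 4 = 128 and each of the 8 heads operates on
# 32 / 8 = 4 dimensions; the base model of Vaswani et al. (2017) uses MODEL_SIZE = 512.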
# # # # # # # # # # # # # # # # # # # # # # # #
# Regularizers
REGULARIZATION_FN = 'L2' # Regularization function. 'L1', 'L2' and 'L1_L2' supported.
WEIGHT_DECAY = 1e-4 # Regularization coefficient.
RECURRENT_WEIGHT_DECAY = 0. # Regularization coefficient in recurrent layers.
DROPOUT_P = 0. # Percentage of units to drop (0 means no dropout).
RECURRENT_INPUT_DROPOUT_P = 0. # Percentage of units to drop in input cells of recurrent layers.
RECURRENT_DROPOUT_P = 0. # Percentage of units to drop in recurrent layers.
ATTENTION_DROPOUT_P = 0. # Percentage of units to drop in attention layers (0 means no dropout).
USE_NOISE = False # Use gaussian noise during training.
NOISE_AMOUNT = 0.01 # Amount of noise.
USE_BATCH_NORMALIZATION = True # If True it is recommended to deactivate Dropout.
BATCH_NORMALIZATION_MODE = 1 # See documentation in Keras' BN.
USE_PRELU = False # Use PReLU activations as a regularizer.
USE_L1 = False # L1 normalization on the features.
USE_L2 = False # L2 normalization on the features.
DOUBLE_STOCHASTIC_ATTENTION_REG = 0.0 # Doubly stochastic attention (Eq. 14 from arXiv:1502.03044).
# Results plot and models storing parameters.
EXTRA_NAME = '' # This will be appended to the end of the model name.
if MODEL_TYPE == 'AttentionRNNEncoderDecoder':
MODEL_NAME = TASK_NAME + '_' + SRC_LAN + TRG_LAN + '_' + MODEL_TYPE + \
'_src_emb_' + str(SOURCE_TEXT_EMBEDDING_SIZE) + \
'_bidir_' + str(BIDIRECTIONAL_ENCODER) + \
'_enc_' + ENCODER_RNN_TYPE + '_' + str(ENCODER_HIDDEN_SIZE) + \
'_dec_' + DECODER_RNN_TYPE + '_' + str(DECODER_HIDDEN_SIZE) + \
'_deepout_' + '_'.join([layer[0] for layer in DEEP_OUTPUT_LAYERS]) + \
'_trg_emb_' + str(TARGET_TEXT_EMBEDDING_SIZE) + \
'_' + OPTIMIZER + '_' + str(LR)
elif MODEL_TYPE == 'Transformer':
MODEL_NAME = TASK_NAME + '_' + SRC_LAN + TRG_LAN + '_' + MODEL_TYPE + \
'_model_size_' + str(MODEL_SIZE) + \
'_ff_size_' + str(FF_SIZE) + \
'_num_heads_' + str(N_HEADS) + \
'_encoder_blocks_' + str(N_LAYERS_ENCODER) + \
'_decoder_blocks_' + str(N_LAYERS_DECODER) + \
'_deepout_' + '_'.join([layer[0] for layer in DEEP_OUTPUT_LAYERS]) + \
'_' + OPTIMIZER + '_' + str(LR)
else:
MODEL_NAME = TASK_NAME + '_' + SRC_LAN + TRG_LAN + '_' +\
MODEL_TYPE + '_' + OPTIMIZER + '_' + str(LR)
MODEL_NAME += EXTRA_NAME
STORE_PATH = 'trained_models/' + MODEL_NAME + '/' # Models and evaluation results will be stored here.
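# Illustrative note: with the default values above, MODEL_NAME should resolve to
# 'EuTrans_esen_AttentionRNNEncoderDecoder_src_emb_32_bidir_True_enc_LSTM_32_dec_ConditionalLSTM_32_deepout_linear_trg_emb_32_Adam_0.001',
# so models and evaluation results would land in trained_models/<that name>/.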
DATASET_STORE_PATH = 'datasets/' # Dataset instance will be stored here.
# Tensorboard configuration. Only if the backend is Tensorflow. Otherwise, it will be ignored.
TENSORBOARD = True # Switches On/Off the tensorboard callback.
LOG_DIR = 'tensorboard_logs' # Directory to store the TensorBoard logs. It will be created inside STORE_PATH.
EMBEDDINGS_FREQ = 1 # Frequency (in epochs) at which selected embedding layers will be saved.
EMBEDDINGS_LAYER_NAMES = [ # A list of names of the layers to keep an eye on. If None or an empty list, all embedding layers will be watched.
'source_word_embedding',
'target_word_embedding']
EMBEDDINGS_METADATA = None # Dictionary which maps layer name to a file name in which metadata for this embedding layer is saved.
LABEL_WORD_EMBEDDINGS_WITH_VOCAB = True # Whether to use vocabularies as word embeddings labels (will overwrite EMBEDDINGS_METADATA).
WORD_EMBEDDINGS_LABELS = [ # Vocabularies for labeling. Must match EMBEDDINGS_LAYER_NAMES.
'source_text',
'target_text']
SAMPLING_SAVE_MODE = 'list' # 'list': Store in a text file, one sentence per line.
PLOT_EVALUATION = False # If True, the evaluation will be plotted into the model folder.
VERBOSE = 1 # Verbosity level.
RELOAD = 0 # If 0, start training from scratch; otherwise, the model
# saved at epoch 'RELOAD' will be used.
RELOAD_EPOCH = True # Select whether RELOAD refers to an epoch or to an update number.
REBUILD_DATASET = True # Build again or use stored instance.
MODE = 'training' # 'training' or 'sampling' (if 'sampling' then RELOAD must
# be greater than 0 and EVAL_ON_SETS will be used).
# Extra parameters for special trainings. In most cases, they should be set to `False`
TRAIN_ON_TRAINVAL = False # Train the model on both the training and validation sets combined.
FORCE_RELOAD_VOCABULARY = False # Force building a new vocabulary from the training samples,
# applicable if RELOAD > 1.
# ================================================ #
parameters = locals().copy()
return parameters
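# Minimal usage sketch (not part of the original repository): print a few of the loaded
# hyperparameters from the command line. It only reads the dictionary returned by
# load_parameters(); the key names below all exist in this file.
if __name__ == '__main__':
    params = load_parameters()
    # Keep the output short by printing a small, fixed subset of keys.
    for key in ('MODEL_NAME', 'MODEL_TYPE', 'BATCH_SIZE', 'LR', 'BEAM_SIZE', 'STORE_PATH'):
        print('%s = %s' % (key, params[key]))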