From 676264dd3522e45873658aaa98ebec53eb2c3234 Mon Sep 17 00:00:00 2001
From: Agoniii <815244047@qq.com>
Date: Mon, 18 Dec 2023 12:11:08 +0000
Subject: [PATCH] fix gated_linear_unit bug

Signed-off-by: Agoniii <815244047@qq.com>
---
 .../nlp/models/language_modeling/megatron_base_model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
index 99b0c81ac790..6de89b2ad83e 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -346,6 +346,7 @@ def build_transformer_config(self) -> TransformerConfig:
         add_bias_linear = self.cfg.get('bias', True)
 
         activation = self.cfg.get('activation', 'gelu')
+        gated_linear_unit = activation.endswith('glu')
         # TODO: need to check which activation functions are supported in mcore
         activation_func = activation_to_func(activation)
 
@@ -395,7 +396,7 @@
             'apply_residual_connection_post_layernorm': False,  # we don't use this in NeMo
             'layernorm_zero_centered_gamma': False,
             'add_bias_linear': add_bias_linear,
-            'gated_linear_unit': False,
+            'gated_linear_unit': gated_linear_unit,
             'activation_func': activation_func,
             'normalization': normalization,
             'init_method': init_method,
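
For context, the patch derives gated_linear_unit from the configured activation name instead of hardcoding it to False, so GLU-style activations enable the gated feed-forward path in the resulting TransformerConfig. Below is a minimal standalone sketch of that logic in plain Python, outside of NeMo; the activation names 'gelu', 'swiglu', and 'fast-geglu' used here are illustrative assumptions, not an exhaustive list of what NeMo accepts.

    # Standalone sketch (not NeMo code) of how the flag is derived from the name.
    def derive_gated_linear_unit(activation: str = 'gelu') -> bool:
        # GLU-style activation names (e.g. 'swiglu', 'geglu') end with 'glu',
        # so the flag can be read off the name; plain 'gelu' does not match.
        return activation.endswith('glu')

    if __name__ == '__main__':
        for name in ('gelu', 'swiglu', 'fast-geglu'):
            print(name, derive_gated_linear_unit(name))
        # prints: gelu False, swiglu True, fast-geglu True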