[ASR] change default initializer to kaiming_uniform #1577

Merged: 6 commits, Mar 22, 2022
1 change: 1 addition & 0 deletions examples/aishell/asr1/conf/conformer.yaml
@@ -37,6 +37,7 @@ model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
init_type: 'kaiming_uniform'

###########################################
# Data #
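For context, the new option sits under model_conf, which U2Model reads at construction time (see the u2.py change below). A minimal sketch of pulling it out of the parsed config, assuming the YAML is loaded with PyYAML into a plain dict:

    import yaml  # assumption: the recipe parses this file with PyYAML (or equivalent) into a dict

    with open("examples/aishell/asr1/conf/conformer.yaml") as f:
        configs = yaml.safe_load(f)

    model_conf = configs.get("model_conf", dict())
    init_type = model_conf.get("init_type", None)  # 'kaiming_uniform' with this change
    print(init_type)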
6 changes: 6 additions & 0 deletions paddlespeech/s2t/__init__.py
@@ -21,6 +21,7 @@
from paddle.fluid import core
from paddle.nn import functional as F

from paddlespeech.s2t.modules import initializer
from paddlespeech.s2t.utils.log import Log

#TODO(Hui Zhang): remove fluid import
@@ -505,3 +506,8 @@ def update(self, modules: Mapping[str, Layer]) -> None:
logger.debug(
"register user LayerDict to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'LayerDict', LayerDict)

"""
hack KaiminigUniform: change limit from np.sqrt(6.0 / float(fan_in)) to np.sqrt(1.0 / float(fan_in))
"""
paddle.nn.initializer.KaimingUniform = initializer.KaimingUniform
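The monkey patch above swaps Paddle's built-in KaimingUniform for the project's own variant from paddlespeech.s2t.modules.initializer. Per the comment, the only conceptual difference is a narrower sampling bound; a minimal NumPy sketch of that bound (illustrative only, not the actual initializer implementation):

    import numpy as np

    def kaiming_uniform_bound(fan_in: int, patched: bool = True) -> float:
        # Stock KaimingUniform samples from U(-sqrt(6/fan_in), +sqrt(6/fan_in));
        # the patched variant narrows the limit to sqrt(1/fan_in).
        scale = 1.0 if patched else 6.0
        return np.sqrt(scale / float(fan_in))

    def sample_weights(shape, fan_in):
        limit = kaiming_uniform_bound(fan_in)
        return np.random.uniform(low=-limit, high=limit, size=shape)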
9 changes: 8 additions & 1 deletion paddlespeech/s2t/models/u2/u2.py
@@ -41,6 +41,7 @@
from paddlespeech.s2t.modules.mask import mask_finished_preds
from paddlespeech.s2t.modules.mask import mask_finished_scores
from paddlespeech.s2t.modules.mask import subsequent_mask
from paddlespeech.s2t.modules.nets_utils import initialize
from paddlespeech.s2t.utils import checkpoint
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank
@@ -72,6 +73,7 @@ def __init__(self,
assert 0.0 <= ctc_weight <= 1.0, ctc_weight

nn.Layer.__init__(self)

# note that eos is the same as sos (equivalent ID)
self.sos = vocab_size - 1
self.eos = vocab_size - 1
@@ -780,9 +782,14 @@ def encode(self, x):

class U2Model(U2DecodeModel):
def __init__(self, configs: dict):
model_conf = configs.get('model_conf', dict())
init_type = model_conf.get("init_type", None)
if init_type is not None:
logger.info(f"Use {init_type} initializer as default initializer")
initialize(self, init_type)
vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs)
nn.initializer.set_global_initializer(None)

model_conf = configs.get('model_conf', dict())
super().__init__(
vocab_size=vocab_size,
encoder=encoder,
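The construction flow above is: read init_type from model_conf, install the matching initializer as the global default, build the submodules, then call nn.initializer.set_global_initializer(None) so models built afterwards fall back to Paddle's defaults. A condensed sketch of that pattern, assuming initialize(self, init_type) essentially sets a global KaimingUniform initializer (build_submodules below is a hypothetical stand-in for U2Model._init_from_config):

    import paddle
    from paddle import nn

    def construct_with_default_init(build_submodules, init_type=None):
        if init_type == "kaiming_uniform":
            # Assumption: roughly what initialize(model, init_type) amounts to.
            nn.initializer.set_global_initializer(
                nn.initializer.KaimingUniform(), nn.initializer.Constant(0.0))
        modules = build_submodules()
        # Reset so later models are unaffected by the global default.
        nn.initializer.set_global_initializer(None)
        return modules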
2 changes: 1 addition & 1 deletion paddlespeech/s2t/modules/attention.py
@@ -95,7 +95,7 @@ def forward_attention(self,
mask (paddle.Tensor): Mask, size (#batch, 1, time2) or
(#batch, time1, time2).
Returns:
paddle.Tensor: Transformed value weighted
paddle.Tensor: Transformed value weighted
by the attention score, (#batch, time1, d_model).
"""
n_batch = value.shape[0]
18 changes: 14 additions & 4 deletions paddlespeech/s2t/modules/conformer_convolution.py
@@ -60,8 +60,8 @@ def __init__(self,
)

# self.lorder is used to distinguish if it's a causal convolution,
# if self.lorder > 0:
# it's a causal convolution, the input will be padded with
# if self.lorder > 0:
# it's a causal convolution, the input will be padded with
# `self.lorder` frames on the left in forward (causal conv impl).
# else: it's a symmetrical convolution
if causal:
@@ -87,10 +87,20 @@ def __init__(self,
assert norm in ['batch_norm', 'layer_norm']
if norm == "batch_norm":
self.use_layer_norm = False
self.norm = nn.BatchNorm1D(channels)
self.norm = nn.BatchNorm1D(
channels,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(0.0)))
else:
self.use_layer_norm = True
self.norm = nn.LayerNorm(channels)
self.norm = nn.LayerNorm(
channels,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(0.0)))

self.pointwise_conv2 = nn.Conv1D(
channels,
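Because the global initializer would otherwise also reinitialize normalization parameters, this PR pins every BatchNorm1D/LayerNorm weight to 1.0 and bias to 0.0 via explicit ParamAttr. The same pattern repeats in decoder.py, decoder_layer.py, encoder.py, and encoder_layer.py below; a small helper like this (not part of the PR, just a sketch of the repeated snippet) captures it once:

    import paddle
    from paddle import nn

    def layer_norm_default_init(size: int, epsilon: float = 1e-12) -> nn.LayerNorm:
        # Pin scale/bias to the usual 1.0/0.0 so a global initializer
        # (e.g. KaimingUniform) does not override them.
        return nn.LayerNorm(
            size,
            epsilon=epsilon,
            weight_attr=paddle.ParamAttr(
                initializer=nn.initializer.Constant(1.0)),
            bias_attr=paddle.ParamAttr(
                initializer=nn.initializer.Constant(0.0)))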
15 changes: 13 additions & 2 deletions paddlespeech/s2t/modules/decoder.py
@@ -76,19 +76,30 @@ def __init__(
concat_after: bool=False, ):

assert check_argument_types()

nn.Layer.__init__(self)
self.selfattention_layer_type = 'selfattn'
attention_dim = encoder_output_size

if input_layer == "embed":
self.embed = nn.Sequential(
nn.Embedding(vocab_size, attention_dim),
nn.Embedding(
vocab_size,
attention_dim,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Normal())),
PositionalEncoding(attention_dim, positional_dropout_rate), )
else:
raise ValueError(f"only 'embed' is supported: {input_layer}")

self.normalize_before = normalize_before
self.after_norm = nn.LayerNorm(attention_dim, epsilon=1e-12)
self.after_norm = nn.LayerNorm(
attention_dim,
epsilon=1e-12,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(0.0)))
self.use_output_layer = use_output_layer
self.output_layer = nn.Linear(attention_dim, vocab_size)

24 changes: 21 additions & 3 deletions paddlespeech/s2t/modules/decoder_layer.py
@@ -62,9 +62,27 @@ def __init__(
self.self_attn = self_attn
self.src_attn = src_attn
self.feed_forward = feed_forward
self.norm1 = nn.LayerNorm(size, epsilon=1e-12)
self.norm2 = nn.LayerNorm(size, epsilon=1e-12)
self.norm3 = nn.LayerNorm(size, epsilon=1e-12)
self.norm1 = nn.LayerNorm(
size,
epsilon=1e-12,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(0.0)))
self.norm2 = nn.LayerNorm(
size,
epsilon=1e-12,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(0.0)))
self.norm3 = nn.LayerNorm(
size,
epsilon=1e-12,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(0.0)))
self.dropout = nn.Dropout(dropout_rate)
self.normalize_before = normalize_before
self.concat_after = concat_after
9 changes: 8 additions & 1 deletion paddlespeech/s2t/modules/encoder.py
@@ -129,7 +129,13 @@ def __init__(
d_model=output_size, dropout_rate=positional_dropout_rate), )

self.normalize_before = normalize_before
self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12)
self.after_norm = nn.LayerNorm(
output_size,
epsilon=1e-12,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(0.0)))
self.static_chunk_size = static_chunk_size
self.use_dynamic_chunk = use_dynamic_chunk
self.use_dynamic_left_chunk = use_dynamic_left_chunk
@@ -457,6 +463,7 @@ def __init__(
cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm']
"""
assert check_argument_types()

super().__init__(input_size, output_size, attention_heads, linear_units,
num_blocks, dropout_rate, positional_dropout_rate,
attention_dropout_rate, input_layer,
42 changes: 35 additions & 7 deletions paddlespeech/s2t/modules/encoder_layer.py
@@ -39,7 +39,7 @@ def __init__(
normalize_before: bool=True,
concat_after: bool=False, ):
"""Construct an EncoderLayer object.

Args:
size (int): Input dimension.
self_attn (nn.Layer): Self-attention module instance.
@@ -147,7 +147,7 @@ def __init__(
normalize_before: bool=True,
concat_after: bool=False, ):
"""Construct an EncoderLayer object.

Args:
size (int): Input dimension.
self_attn (nn.Layer): Self-attention module instance.
@@ -174,18 +174,46 @@ def __init__(
self.feed_forward = feed_forward
self.feed_forward_macaron = feed_forward_macaron
self.conv_module = conv_module
self.norm_ff = nn.LayerNorm(size, epsilon=1e-12) # for the FNN module
self.norm_mha = nn.LayerNorm(size, epsilon=1e-12) # for the MHA module
self.norm_ff = nn.LayerNorm(
size,
epsilon=1e-12,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(0.0))) # for the FNN module
self.norm_mha = nn.LayerNorm(
size,
epsilon=1e-12,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(0.0))) # for the MHA module
if feed_forward_macaron is not None:
self.norm_ff_macaron = nn.LayerNorm(size, epsilon=1e-12)
self.norm_ff_macaron = nn.LayerNorm(
size,
epsilon=1e-12,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(0.0)))
self.ff_scale = 0.5
else:
self.ff_scale = 1.0
if self.conv_module is not None:
self.norm_conv = nn.LayerNorm(
size, epsilon=1e-12) # for the CNN module
size,
epsilon=1e-12,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(initializer=nn.initializer.Constant(
0.0))) # for the CNN module
self.norm_final = nn.LayerNorm(
size, epsilon=1e-12) # for the final output of the block
size,
epsilon=1e-12,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Constant(1.0)),
bias_attr=paddle.ParamAttr(initializer=nn.initializer.Constant(
0.0))) # for the final output of the block
self.dropout = nn.Dropout(dropout_rate)
self.size = size
self.normalize_before = normalize_before