wenet-e2e · Mddct · Apr 10, 2024 · Apr 9, 2024 · Apr 9, 2024
diff --git a/README.md b/README.md
@@ -56,7 +56,7 @@ git clone https://github.com/wenet-e2e/wenet.git
 - Create Conda env:
 
 ``` sh
-conda create -n wenet python=3.8
+conda create -n wenet python=3.10
 conda activate wenet
 conda install conda-forge::sox
 pip install -r requirements.txt

diff --git a/requirements.txt b/requirements.txt
@@ -18,7 +18,7 @@ cpplint==1.6.1
 torch>=2.1.2
 torchaudio>=2.1.2
 tqdm
-deepspeed<0.13.0
+deepspeed>=0.14.0
 librosa
 openai-whisper
 pre-commit==3.5.0

diff --git a/tools/compute_cmvn_stats.py b/tools/compute_cmvn_stats.py
@@ -30,8 +30,7 @@ def __call__(self, batch):
             value = item[1].strip().split(",")
             assert len(value) == 3 or len(value) == 1
             wav_path = value[0]
-            sample_rate = torchaudio.info(
-                wav_path).sample_rate
+            sample_rate = torchaudio.info(wav_path).sample_rate
             resample_rate = sample_rate
             # len(value) == 3 means segmented wav.scp,
             # len(value) == 1 means original wav.scp

diff --git a/tools/wav2dur.py b/tools/wav2dur.py
@@ -5,7 +5,6 @@
 
 import torchaudio
 
-
 scp = sys.argv[1]
 dur_scp = sys.argv[2]
 

diff --git a/wenet/dataset/dataset.py b/wenet/dataset/dataset.py
@@ -66,7 +66,8 @@ def Dataset(data_type,
     dataset = dataset.map_ignore_error(processor.decode_wav)
 
     singal_channel_conf = conf.get('singal_channel_conf', {})
-    dataset = dataset.map(partial(processor.singal_channel, **singal_channel_conf))
+    dataset = dataset.map(
+        partial(processor.singal_channel, **singal_channel_conf))
 
     speaker_conf = conf.get('speaker_conf', None)
     if speaker_conf is not None:

diff --git a/wenet/dataset/processor.py b/wenet/dataset/processor.py
@@ -141,6 +141,7 @@ def decode_wav(sample):
     sample['sample_rate'] = sample_rate
     return sample
 
+
 def singal_channel(sample, channel=0):
     """ Choose a channel of sample.
         Inplace operation.

diff --git a/wenet/finetune/lora/attention.py b/wenet/finetune/lora/attention.py
@@ -35,6 +35,7 @@ class LoRAMultiHeadedAttention(MultiHeadedAttention):
         dropout_rate (float): Dropout rate.
 
     """
+
     def __init__(self,
                  n_head: int,
                  n_feat: int,
@@ -57,7 +58,10 @@ def __init__(self,
         self.d_k = n_feat // n_head
         self.h = n_head
         self.linear_out = lora.Linear(
-            n_feat, n_feat, r=lora_rank, lora_alpha=lora_alpha,
+            n_feat,
+            n_feat,
+            r=lora_rank,
+            lora_alpha=lora_alpha,
             lora_dropout=lora_dropout
         ) if lora_list and "o" in lora_list else nn.Linear(n_feat, n_feat)
 
@@ -69,12 +73,15 @@ def __init__(self,
         bias_dict = {"q": query_bias, "k": key_bias, "v": value_bias}
 
         for key, value in lora_qkv_dict.items():
-            setattr(self, f"linear_{key}",
-                    lora.Linear(n_feat, n_feat, r=lora_rank,
-                                lora_alpha=lora_alpha,
-                                lora_dropout=lora_dropout,
-                                bias=bias_dict[key])
-                    if value else nn.Linear(n_feat, n_feat, bias_dict[key]))
+            setattr(
+                self, f"linear_{key}",
+                lora.Linear(n_feat,
+                            n_feat,
+                            r=lora_rank,
+                            lora_alpha=lora_alpha,
+                            lora_dropout=lora_dropout,
+                            bias=bias_dict[key]) if value else nn.Linear(
+                                n_feat, n_feat, bias_dict[key]))
         self.dropout = nn.Dropout(p=dropout_rate)
 
 
@@ -87,6 +94,7 @@ class LoRARelPositionMultiHeadedAttention(LoRAMultiHeadedAttention,
         n_feat (int): The number of features.
         dropout_rate (float): Dropout rate.
     """
+
     def __init__(self,
                  n_head: int,
                  n_feat: int,

diff --git a/wenet/finetune/lora/encoder.py b/wenet/finetune/lora/encoder.py
@@ -86,15 +86,10 @@ def __init__(
         self.encoders = torch.nn.ModuleList([
             TransformerEncoderLayer(
                 output_size,
-                WENET_LORA_ATTENTION_CLASSES["selfattn"](attention_heads,
-                                                         output_size,
-                                                         attention_dropout_rate,
-                                                         query_bias, key_bias,
-                                                         value_bias, use_sdpa,
-                                                         n_kv_head, head_dim,
-                                                         lora_rank, lora_alpha,
-                                                         lora_dropout,
-                                                         lora_list),
+                WENET_LORA_ATTENTION_CLASSES["selfattn"](
+                    attention_heads, output_size, attention_dropout_rate,
+                    query_bias, key_bias, value_bias, use_sdpa, n_kv_head,
+                    head_dim, lora_rank, lora_alpha, lora_dropout, lora_list),
                 mlp_class(output_size, linear_units, dropout_rate, activation,
                           mlp_bias),
                 dropout_rate,
@@ -167,18 +162,17 @@ def __init__(
             causal (bool): whether to use causal convolution or not.
             key_bias: whether use bias in attention.linear_k, False for whisper models.
         """
-        super().__init__(input_size, output_size, attention_heads,
-                         linear_units, num_blocks, dropout_rate,
-                         positional_dropout_rate, attention_dropout_rate,
-                         input_layer, pos_enc_layer_type, normalize_before,
-                         static_chunk_size, use_dynamic_chunk, global_cmvn,
-                         use_dynamic_left_chunk, positionwise_conv_kernel_size,
-                         macaron_style, selfattention_layer_type,
-                         activation_type, use_cnn_module, cnn_module_kernel,
-                         causal, cnn_module_norm, query_bias, key_bias,
-                         value_bias, mlp_bias, conv_bias,
-                         gradient_checkpointing, use_sdpa, mlp_type,
-                         layer_norm_type, norm_eps, n_kv_head, head_dim)
+        super().__init__(
+            input_size, output_size, attention_heads, linear_units, num_blocks,
+            dropout_rate, positional_dropout_rate, attention_dropout_rate,
+            input_layer, pos_enc_layer_type, normalize_before,
+            static_chunk_size, use_dynamic_chunk, global_cmvn,
+            use_dynamic_left_chunk, positionwise_conv_kernel_size,
+            macaron_style, selfattention_layer_type, activation_type,
+            use_cnn_module, cnn_module_kernel, causal, cnn_module_norm,
+            query_bias, key_bias, value_bias, mlp_bias, conv_bias,
+            gradient_checkpointing, use_sdpa, mlp_type, layer_norm_type,
+            norm_eps, n_kv_head, head_dim)
         activation = WENET_ACTIVATION_CLASSES[activation_type]()
 
         # self-attention module definition