Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[speechx] set nnet param by flags #1769

Merged
merged 2 commits into from
Apr 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,10 @@ DEFINE_string(
DEFINE_string(model_output_names,
"softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
"model output names");
DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box",
"model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");

using kaldi::BaseFloat;
using kaldi::Matrix;
Expand Down Expand Up @@ -77,7 +80,8 @@ int main(int argc, char* argv[]) {
ppspeech::ModelOptions model_opts;
model_opts.model_path = model_path;
model_opts.param_path = model_params;
model_opts.cache_shape = FLAGS_model_cache_names;
model_opts.cache_names = FLAGS_model_cache_names;
model_opts.cache_shape = FLAGS_model_cache_shapes;
model_opts.input_names = FLAGS_model_input_names;
model_opts.output_names = FLAGS_model_output_names;
std::shared_ptr<ppspeech::PaddleNnet> nnet(
Expand Down
8 changes: 6 additions & 2 deletions speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,10 @@ DEFINE_string(
DEFINE_string(model_output_names,
"softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
"model output names");
DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box",
"model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");

using kaldi::BaseFloat;
using kaldi::Matrix;
Expand Down Expand Up @@ -80,7 +83,8 @@ int main(int argc, char* argv[]) {
ppspeech::ModelOptions model_opts;
model_opts.model_path = model_graph;
model_opts.param_path = model_params;
model_opts.cache_shape = FLAGS_model_cache_names;
model_opts.cache_names = FLAGS_model_cache_names;
model_opts.cache_shape = FLAGS_model_cache_shapes;
model_opts.input_names = FLAGS_model_input_names;
model_opts.output_names = FLAGS_model_output_names;
std::shared_ptr<ppspeech::PaddleNnet> nnet(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ int main(int argc, char* argv[]) {

int32 num_done = 0, num_err = 0;

// feature pipeline: wave cache --> hanning
// window -->linear_spectrogram --> global cmvn -> feat cache
// feature pipeline: wave cache --> hanning window
// -->linear_spectrogram --> global cmvn -> feat cache

std::unique_ptr<ppspeech::FrontendInterface> data_source(
new ppspeech::AudioCache(3600 * 1600, true));
Expand Down
Empty file.
9 changes: 7 additions & 2 deletions speechx/speechx/decoder/param.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ DEFINE_string(
DEFINE_string(model_output_names,
"softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
"model output names");
DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box",
"model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");


namespace ppspeech {
Expand All @@ -70,7 +73,9 @@ ModelOptions InitModelOptions() {
ModelOptions model_opts;
model_opts.model_path = FLAGS_model_path;
model_opts.param_path = FLAGS_param_path;
model_opts.cache_shape = FLAGS_model_cache_names;
model_opts.cache_names = FLAGS_model_cache_names;
model_opts.cache_shape = FLAGS_model_cache_shapes;
model_opts.input_names = FLAGS_model_input_names;
model_opts.output_names = FLAGS_model_output_names;
return model_opts;
}
Expand Down
4 changes: 2 additions & 2 deletions speechx/speechx/frontend/audio/audio_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace ppspeech {
class AudioCache : public FrontendInterface {
public:
explicit AudioCache(int buffer_size = 1000 * kint16max,
bool to_float32 = true);
bool to_float32 = false);

virtual void Accept(const kaldi::VectorBase<BaseFloat>& waves);

Expand Down Expand Up @@ -58,7 +58,7 @@ class AudioCache : public FrontendInterface {
std::mutex mutex_;
std::condition_variable ready_feed_condition_;
kaldi::int32 timeout_; // millisecond
bool to_float32_;
bool to_float32_; // int16 -> float32. used in linear_spectrogram

DISALLOW_COPY_AND_ASSIGN(AudioCache);
};
Expand Down
11 changes: 8 additions & 3 deletions speechx/speechx/frontend/audio/feature_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
usleep(100); // sleep 0.1 ms
}
if (cache_.empty()) return false;

// read from cache
feats->Resize(cache_.front().Dim());
feats->CopyFromVec(cache_.front());
cache_.pop();
Expand All @@ -74,15 +76,16 @@ bool FeatureCache::Compute() {

// join with remained
int32 joint_len = feature.Dim() + remained_feature_.Dim();
int32 num_chunk =
((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;

Vector<BaseFloat> joint_feature(joint_len);
joint_feature.Range(0, remained_feature_.Dim())
.CopyFromVec(remained_feature_);
joint_feature.Range(remained_feature_.Dim(), feature.Dim())
.CopyFromVec(feature);

// one by one, or stride with window
// controlled by frame_chunk_stride_ and frame_chunk_size_
int32 num_chunk =
((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;
for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
int32 start = chunk_idx * frame_chunk_stride_ * dim_;

Expand All @@ -101,6 +104,8 @@ bool FeatureCache::Compute() {
cache_.push(feature_chunk);
ready_read_condition_.notify_one();
}

// cache remained feats
int32 remained_feature_len =
joint_len - num_chunk * frame_chunk_stride_ * dim_;
remained_feature_.Resize(remained_feature_len);
Expand Down
2 changes: 1 addition & 1 deletion speechx/speechx/frontend/audio/mfcc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ using kaldi::Matrix;
using std::vector;

Mfcc::Mfcc(const MfccOptions& opts,
std::unique_ptr<FrontendInterface> base_extractor)
std::unique_ptr<FrontendInterface> base_extractor)
: opts_(opts),
computer_(opts.mfcc_opts),
window_function_(computer_.GetFrameOptions()) {
Expand Down
26 changes: 19 additions & 7 deletions speechx/speechx/nnet/paddle_nnet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) {
LOG(INFO) << "output names: " << opts.output_names;
vector<string> input_names_vec = absl::StrSplit(opts.input_names, ",");
vector<string> output_names_vec = absl::StrSplit(opts.output_names, ",");

paddle_infer::Predictor* predictor = GetPredictor();

std::vector<std::string> model_input_names = predictor->GetInputNames();
Expand All @@ -87,6 +88,7 @@ PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) {
for (size_t i = 0; i < output_names_vec.size(); i++) {
assert(output_names_vec[i] == model_output_names[i]);
}

ReleasePredictor(predictor);
InitCacheEncouts(opts);
}
Expand All @@ -95,6 +97,7 @@ void PaddleNnet::Reset() { InitCacheEncouts(opts_); }

paddle_infer::Predictor* PaddleNnet::GetPredictor() {
paddle_infer::Predictor* predictor = nullptr;

std::lock_guard<std::mutex> guard(pool_mutex);
int pred_id = 0;

Expand Down Expand Up @@ -144,15 +147,19 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
Vector<BaseFloat>* inferences,
int32* inference_dim) {
paddle_infer::Predictor* predictor = GetPredictor();

int feat_row = features.Dim() / feature_dim;

std::vector<std::string> input_names = predictor->GetInputNames();
std::vector<std::string> output_names = predictor->GetOutputNames();

// feed inputs
std::unique_ptr<paddle_infer::Tensor> input_tensor =
predictor->GetInputHandle(input_names[0]);
std::vector<int> INPUT_SHAPE = {1, feat_row, feature_dim};
input_tensor->Reshape(INPUT_SHAPE);
input_tensor->CopyFromCpu(features.Data());

std::unique_ptr<paddle_infer::Tensor> input_len =
predictor->GetInputHandle(input_names[1]);
std::vector<int> input_len_size = {1};
Expand All @@ -161,32 +168,36 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
audio_len.push_back(feat_row);
input_len->CopyFromCpu(audio_len.data());

std::unique_ptr<paddle_infer::Tensor> h_box =
std::unique_ptr<paddle_infer::Tensor> state_h =
predictor->GetInputHandle(input_names[2]);
shared_ptr<Tensor<BaseFloat>> h_cache = GetCacheEncoder(input_names[2]);
h_box->Reshape(h_cache->get_shape());
h_box->CopyFromCpu(h_cache->get_data().data());
std::unique_ptr<paddle_infer::Tensor> c_box =
state_h->Reshape(h_cache->get_shape());
state_h->CopyFromCpu(h_cache->get_data().data());

std::unique_ptr<paddle_infer::Tensor> state_c =
predictor->GetInputHandle(input_names[3]);
shared_ptr<Tensor<float>> c_cache = GetCacheEncoder(input_names[3]);
c_box->Reshape(c_cache->get_shape());
c_box->CopyFromCpu(c_cache->get_data().data());
state_c->Reshape(c_cache->get_shape());
state_c->CopyFromCpu(c_cache->get_data().data());

// forward
bool success = predictor->Run();

if (success == false) {
LOG(INFO) << "predictor run occurs error";
}

// fetch outputs
std::unique_ptr<paddle_infer::Tensor> h_out =
predictor->GetOutputHandle(output_names[2]);
assert(h_cache->get_shape() == h_out->shape());
h_out->CopyToCpu(h_cache->get_data().data());

std::unique_ptr<paddle_infer::Tensor> c_out =
predictor->GetOutputHandle(output_names[3]);
assert(c_cache->get_shape() == c_out->shape());
c_out->CopyToCpu(c_cache->get_data().data());

// get result
std::unique_ptr<paddle_infer::Tensor> output_tensor =
predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_tensor->shape();
Expand All @@ -195,6 +206,7 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
inferences->Resize(row * col);
*inference_dim = col;
output_tensor->CopyToCpu(inferences->Data());

ReleasePredictor(predictor);
}

Expand Down
33 changes: 16 additions & 17 deletions speechx/speechx/nnet/paddle_nnet.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace ppspeech {
struct ModelOptions {
std::string model_path;
std::string param_path;
int thread_num;
int thread_num; // predictor thread pool size
bool use_gpu;
bool switch_ir_optim;
std::string input_names;
Expand All @@ -34,19 +34,14 @@ struct ModelOptions {
bool enable_fc_padding;
bool enable_profile;
ModelOptions()
: model_path("avg_1.jit.pdmodel"),
param_path("avg_1.jit.pdiparams"),
: model_path(""),
param_path(""),
thread_num(2),
use_gpu(false),
input_names(
"audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_"
"box"),
output_names(
"save_infer_model/scale_0.tmp_1,save_infer_model/"
"scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
"scale_3.tmp_1"),
cache_names("chunk_state_h_box,chunk_state_c_box"),
cache_shape("3-1-1024,3-1-1024"),
input_names(""),
output_names(""),
cache_names(""),
cache_shape(""),
switch_ir_optim(false),
enable_fc_padding(false),
enable_profile(false) {}
Expand Down Expand Up @@ -76,17 +71,19 @@ class Tensor {
public:
Tensor() {}
Tensor(const std::vector<int>& shape) : _shape(shape) {
int data_size = std::accumulate(
int neml = std::accumulate(
_shape.begin(), _shape.end(), 1, std::multiplies<int>());
LOG(INFO) << "data size: " << data_size;
_data.resize(data_size, 0);
LOG(INFO) << "Tensor neml: " << neml;
_data.resize(neml, 0);
}

void reshape(const std::vector<int>& shape) {
_shape = shape;
int data_size = std::accumulate(
int neml = std::accumulate(
_shape.begin(), _shape.end(), 1, std::multiplies<int>());
_data.resize(data_size, 0);
_data.resize(neml, 0);
}

const std::vector<int>& get_shape() const { return _shape; }
std::vector<T>& get_data() { return _data; }

Expand All @@ -98,10 +95,12 @@ class Tensor {
class PaddleNnet : public NnetInterface {
public:
PaddleNnet(const ModelOptions& opts);

virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
int32 feature_dim,
kaldi::Vector<kaldi::BaseFloat>* inferences,
int32* inference_dim);

void Dim();
virtual void Reset();
std::shared_ptr<Tensor<kaldi::BaseFloat>> GetCacheEncoder(
Expand Down