Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[speechx] set nnet param by flags #1769

Merged
merged 2 commits into from
Apr 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,10 @@ DEFINE_string(
DEFINE_string(model_output_names,
"softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
"model output names");
DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box",
"model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");

using kaldi::BaseFloat;
using kaldi::Matrix;
Expand Down Expand Up @@ -77,7 +80,8 @@ int main(int argc, char* argv[]) {
ppspeech::ModelOptions model_opts;
model_opts.model_path = model_path;
model_opts.param_path = model_params;
model_opts.cache_shape = FLAGS_model_cache_names;
model_opts.cache_names = FLAGS_model_cache_names;
model_opts.cache_shape = FLAGS_model_cache_shapes;
model_opts.input_names = FLAGS_model_input_names;
model_opts.output_names = FLAGS_model_output_names;
std::shared_ptr<ppspeech::PaddleNnet> nnet(
Expand Down
8 changes: 6 additions & 2 deletions speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,10 @@ DEFINE_string(
DEFINE_string(model_output_names,
"softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
"model output names");
DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box",
"model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");

using kaldi::BaseFloat;
using kaldi::Matrix;
Expand Down Expand Up @@ -80,7 +83,8 @@ int main(int argc, char* argv[]) {
ppspeech::ModelOptions model_opts;
model_opts.model_path = model_graph;
model_opts.param_path = model_params;
model_opts.cache_shape = FLAGS_model_cache_names;
model_opts.cache_names = FLAGS_model_cache_names;
model_opts.cache_shape = FLAGS_model_cache_shapes;
model_opts.input_names = FLAGS_model_input_names;
model_opts.output_names = FLAGS_model_output_names;
std::shared_ptr<ppspeech::PaddleNnet> nnet(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ int main(int argc, char* argv[]) {

int32 num_done = 0, num_err = 0;

// feature pipeline: wave cache --> hanning
// window -->linear_spectrogram --> global cmvn -> feat cache
// feature pipeline: wave cache --> hanning window
// -->linear_spectrogram --> global cmvn -> feat cache

std::unique_ptr<ppspeech::FrontendInterface> data_source(
new ppspeech::AudioCache(3600 * 1600, true));
Expand Down
Empty file.
9 changes: 7 additions & 2 deletions speechx/speechx/decoder/param.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ DEFINE_string(
DEFINE_string(model_output_names,
"softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
"model output names");
DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box",
"model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");


namespace ppspeech {
Expand All @@ -70,7 +73,9 @@ ModelOptions InitModelOptions() {
ModelOptions model_opts;
model_opts.model_path = FLAGS_model_path;
model_opts.param_path = FLAGS_param_path;
model_opts.cache_shape = FLAGS_model_cache_names;
model_opts.cache_names = FLAGS_model_cache_names;
model_opts.cache_shape = FLAGS_model_cache_shapes;
model_opts.input_names = FLAGS_model_input_names;
model_opts.output_names = FLAGS_model_output_names;
return model_opts;
}
Expand Down
4 changes: 2 additions & 2 deletions speechx/speechx/frontend/audio/audio_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace ppspeech {
class AudioCache : public FrontendInterface {
public:
explicit AudioCache(int buffer_size = 1000 * kint16max,
bool to_float32 = true);
bool to_float32 = false);

virtual void Accept(const kaldi::VectorBase<BaseFloat>& waves);

Expand Down Expand Up @@ -58,7 +58,7 @@ class AudioCache : public FrontendInterface {
std::mutex mutex_;
std::condition_variable ready_feed_condition_;
kaldi::int32 timeout_; // millisecond
bool to_float32_;
bool to_float32_; // int16 -> float32. used in linear_spectrogram

DISALLOW_COPY_AND_ASSIGN(AudioCache);
};
Expand Down
11 changes: 8 additions & 3 deletions speechx/speechx/frontend/audio/feature_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
usleep(100); // sleep 0.1 ms
}
if (cache_.empty()) return false;

// read from cache
feats->Resize(cache_.front().Dim());
feats->CopyFromVec(cache_.front());
cache_.pop();
Expand All @@ -74,15 +76,16 @@ bool FeatureCache::Compute() {

// join with remained
int32 joint_len = feature.Dim() + remained_feature_.Dim();
int32 num_chunk =
((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;

Vector<BaseFloat> joint_feature(joint_len);
joint_feature.Range(0, remained_feature_.Dim())
.CopyFromVec(remained_feature_);
joint_feature.Range(remained_feature_.Dim(), feature.Dim())
.CopyFromVec(feature);

// one by one, or stride with window
// controlled by frame_chunk_stride_ and frame_chunk_size_
int32 num_chunk =
((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;
for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
int32 start = chunk_idx * frame_chunk_stride_ * dim_;

Expand All @@ -101,6 +104,8 @@ bool FeatureCache::Compute() {
cache_.push(feature_chunk);
ready_read_condition_.notify_one();
}

// cache remained feats
int32 remained_feature_len =
joint_len - num_chunk * frame_chunk_stride_ * dim_;
remained_feature_.Resize(remained_feature_len);
Expand Down
2 changes: 1 addition & 1 deletion speechx/speechx/frontend/audio/mfcc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ using kaldi::Matrix;
using std::vector;

Mfcc::Mfcc(const MfccOptions& opts,
std::unique_ptr<FrontendInterface> base_extractor)
std::unique_ptr<FrontendInterface> base_extractor)
: opts_(opts),
computer_(opts.mfcc_opts),
window_function_(computer_.GetFrameOptions()) {
Expand Down
26 changes: 19 additions & 7 deletions speechx/speechx/nnet/paddle_nnet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) {
LOG(INFO) << "output names: " << opts.output_names;
vector<string> input_names_vec = absl::StrSplit(opts.input_names, ",");
vector<string> output_names_vec = absl::StrSplit(opts.output_names, ",");

paddle_infer::Predictor* predictor = GetPredictor();

std::vector<std::string> model_input_names = predictor->GetInputNames();
Expand All @@ -87,6 +88,7 @@ PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) {
for (size_t i = 0; i < output_names_vec.size(); i++) {
assert(output_names_vec[i] == model_output_names[i]);
}

ReleasePredictor(predictor);
InitCacheEncouts(opts);
}
Expand All @@ -95,6 +97,7 @@ void PaddleNnet::Reset() { InitCacheEncouts(opts_); }

paddle_infer::Predictor* PaddleNnet::GetPredictor() {
paddle_infer::Predictor* predictor = nullptr;

std::lock_guard<std::mutex> guard(pool_mutex);
int pred_id = 0;

Expand Down Expand Up @@ -144,15 +147,19 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
Vector<BaseFloat>* inferences,
int32* inference_dim) {
paddle_infer::Predictor* predictor = GetPredictor();

int feat_row = features.Dim() / feature_dim;

std::vector<std::string> input_names = predictor->GetInputNames();
std::vector<std::string> output_names = predictor->GetOutputNames();

// feed inputs
std::unique_ptr<paddle_infer::Tensor> input_tensor =
predictor->GetInputHandle(input_names[0]);
std::vector<int> INPUT_SHAPE = {1, feat_row, feature_dim};
input_tensor->Reshape(INPUT_SHAPE);
input_tensor->CopyFromCpu(features.Data());

std::unique_ptr<paddle_infer::Tensor> input_len =
predictor->GetInputHandle(input_names[1]);
std::vector<int> input_len_size = {1};
Expand All @@ -161,32 +168,36 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
audio_len.push_back(feat_row);
input_len->CopyFromCpu(audio_len.data());

std::unique_ptr<paddle_infer::Tensor> h_box =
std::unique_ptr<paddle_infer::Tensor> state_h =
predictor->GetInputHandle(input_names[2]);
shared_ptr<Tensor<BaseFloat>> h_cache = GetCacheEncoder(input_names[2]);
h_box->Reshape(h_cache->get_shape());
h_box->CopyFromCpu(h_cache->get_data().data());
std::unique_ptr<paddle_infer::Tensor> c_box =
state_h->Reshape(h_cache->get_shape());
state_h->CopyFromCpu(h_cache->get_data().data());

std::unique_ptr<paddle_infer::Tensor> state_c =
predictor->GetInputHandle(input_names[3]);
shared_ptr<Tensor<float>> c_cache = GetCacheEncoder(input_names[3]);
c_box->Reshape(c_cache->get_shape());
c_box->CopyFromCpu(c_cache->get_data().data());
state_c->Reshape(c_cache->get_shape());
state_c->CopyFromCpu(c_cache->get_data().data());

// forward
bool success = predictor->Run();

if (success == false) {
LOG(INFO) << "predictor run occurs error";
}

// fetch outputs
std::unique_ptr<paddle_infer::Tensor> h_out =
predictor->GetOutputHandle(output_names[2]);
assert(h_cache->get_shape() == h_out->shape());
h_out->CopyToCpu(h_cache->get_data().data());

std::unique_ptr<paddle_infer::Tensor> c_out =
predictor->GetOutputHandle(output_names[3]);
assert(c_cache->get_shape() == c_out->shape());
c_out->CopyToCpu(c_cache->get_data().data());

// get result
std::unique_ptr<paddle_infer::Tensor> output_tensor =
predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_tensor->shape();
Expand All @@ -195,6 +206,7 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
inferences->Resize(row * col);
*inference_dim = col;
output_tensor->CopyToCpu(inferences->Data());

ReleasePredictor(predictor);
}

Expand Down
33 changes: 16 additions & 17 deletions speechx/speechx/nnet/paddle_nnet.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace ppspeech {
struct ModelOptions {
std::string model_path;
std::string param_path;
int thread_num;
int thread_num; // predictor thread pool size
bool use_gpu;
bool switch_ir_optim;
std::string input_names;
Expand All @@ -34,19 +34,14 @@ struct ModelOptions {
bool enable_fc_padding;
bool enable_profile;
ModelOptions()
: model_path("avg_1.jit.pdmodel"),
param_path("avg_1.jit.pdiparams"),
: model_path(""),
param_path(""),
thread_num(2),
use_gpu(false),
input_names(
"audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_"
"box"),
output_names(
"save_infer_model/scale_0.tmp_1,save_infer_model/"
"scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
"scale_3.tmp_1"),
cache_names("chunk_state_h_box,chunk_state_c_box"),
cache_shape("3-1-1024,3-1-1024"),
input_names(""),
output_names(""),
cache_names(""),
cache_shape(""),
switch_ir_optim(false),
enable_fc_padding(false),
enable_profile(false) {}
Expand Down Expand Up @@ -76,17 +71,19 @@ class Tensor {
public:
Tensor() {}
Tensor(const std::vector<int>& shape) : _shape(shape) {
int data_size = std::accumulate(
int neml = std::accumulate(
_shape.begin(), _shape.end(), 1, std::multiplies<int>());
LOG(INFO) << "data size: " << data_size;
_data.resize(data_size, 0);
LOG(INFO) << "Tensor neml: " << neml;
_data.resize(neml, 0);
}

void reshape(const std::vector<int>& shape) {
_shape = shape;
int data_size = std::accumulate(
int neml = std::accumulate(
_shape.begin(), _shape.end(), 1, std::multiplies<int>());
_data.resize(data_size, 0);
_data.resize(neml, 0);
}

const std::vector<int>& get_shape() const { return _shape; }
std::vector<T>& get_data() { return _data; }

Expand All @@ -98,10 +95,12 @@ class Tensor {
class PaddleNnet : public NnetInterface {
public:
PaddleNnet(const ModelOptions& opts);

virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
int32 feature_dim,
kaldi::Vector<kaldi::BaseFloat>* inferences,
int32* inference_dim);

void Dim();
virtual void Reset();
std::shared_ptr<Tensor<kaldi::BaseFloat>> GetCacheEncoder(
Expand Down