Skip to content

Commit

Permalink
[runtime/xpu] 🐻Support the execution of non-streaming parsing on the …
Browse files Browse the repository at this point in the history
…Kunlun XPU card #1455
  • Loading branch information
panhehe committed Oct 27, 2022
1 parent 89e8d0d commit 70b5b69
Show file tree
Hide file tree
Showing 28 changed files with 3,462 additions and 6 deletions.
37 changes: 37 additions & 0 deletions runtime/core/cmake/xpu.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
if(NOT WIN32)
string(ASCII 27 Esc)
set(ColourReset "${Esc}[m")
set(ColourBold "${Esc}[1m")
set(Red "${Esc}[31m")
set(Green "${Esc}[32m")
set(Yellow "${Esc}[33m")
set(Blue "${Esc}[34m")
set(Magenta "${Esc}[35m")
set(Cyan "${Esc}[36m")
set(White "${Esc}[37m")
set(BoldRed "${Esc}[1;31m")
set(BoldGreen "${Esc}[1;32m")
set(BoldYellow "${Esc}[1;33m")
set(BoldBlue "${Esc}[1;34m")
set(BoldMagenta "${Esc}[1;35m")
set(BoldCyan "${Esc}[1;36m")
set(BoldWhite "${Esc}[1;37m")
endif()

if(XPU)
set(RUNTIME_KUNLUN_PATH ${CMAKE_CURRENT_SOURCE_DIR})
message(STATUS "RUNTIME_KUNLUN_PATH is ${RUNTIME_KUNLUN_PATH} .\n")
set(KUNLUN_XPU_PATH ${RUNTIME_KUNLUN_PATH}/xpu)
if(NOT DEFINED ENV{XPU_API_PATH})
message(FATAL_ERROR "${BoldRed}NO ENV{XPU_API_PATH} in your env. Please set XPU_API_PATH.${ColourReset}\n")
else()
set(XPU_API_PATH $ENV{XPU_API_PATH})
message("set XPU_API_PATH from env_var. Val is $ENV{XPU_API_PATH}.")
endif()

include_directories(${RUNTIME_KUNLUN_PATH} ${KUNLUN_XPU_PATH}/
${XPU_API_PATH}/output/include ${XPU_API_PATH}/../runtime/include)
link_directories(${XPU_API_PATH}/output/so/ ${XPU_API_PATH}/../runtime/output/so/)

add_definitions(-DUSE_XPU)
endif()
10 changes: 7 additions & 3 deletions runtime/core/decoder/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ set(decoder_srcs
ctc_endpoint.cc
)

if(NOT TORCH AND NOT ONNX)
message(FATAL_ERROR "Please build with TORCH or ONNX!!!")
if(NOT TORCH AND NOT ONNX AND NOT XPU)
message(FATAL_ERROR "Please build with TORCH or ONNX or XPU!!!")
endif()
if(TORCH)
list(APPEND decoder_srcs torch_asr_model.cc)
Expand All @@ -18,7 +18,8 @@ if(ONNX)
endif()

add_library(decoder STATIC ${decoder_srcs})
target_link_libraries(decoder PUBLIC kaldi-decoder frontend post_processor utils)
target_link_libraries(decoder PUBLIC kaldi-decoder frontend
post_processor utils)

if(ANDROID)
target_link_libraries(decoder PUBLIC ${PYTORCH_LIBRARY} ${FBJNI_LIBRARY})
Expand All @@ -29,4 +30,7 @@ else()
if(ONNX)
target_link_libraries(decoder PUBLIC onnxruntime)
endif()
if(XPU)
target_link_libraries(decoder PUBLIC xpu_conformer)
endif()
endif()
26 changes: 23 additions & 3 deletions runtime/core/decoder/params.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.


#ifndef DECODER_PARAMS_H_
#define DECODER_PARAMS_H_

Expand All @@ -29,17 +28,24 @@
#ifdef USE_TORCH
#include "decoder/torch_asr_model.h"
#endif
#ifdef USE_XPU
#include "xpu/xpu_asr_model.h"
#endif
#include "frontend/feature_pipeline.h"
#include "post_processor/post_processor.h"
#include "utils/flags.h"
#include "utils/string.h"

DEFINE_int32(num_threads, 1, "num threads for ASR model");
DEFINE_int32(device_id, 0, "set XPU DeviceID for ASR model");

// TorchAsrModel flags
DEFINE_string(model_path, "", "pytorch exported model path");
// OnnxAsrModel flags
DEFINE_string(onnx_dir, "", "directory where the onnx model is saved");
// XPUAsrModel flags
DEFINE_string(xpu_model_dir, "",
"directory where the XPU model and weights is saved");

// FeaturePipelineConfig flags
DEFINE_int32(num_bins, 80, "num mel bins for fbank feature");
Expand All @@ -66,7 +72,8 @@ DEFINE_double(lattice_beam, 10.0, "lattice beam in ctc wfst search");
DEFINE_double(acoustic_scale, 1.0, "acoustic scale for ctc wfst search");
DEFINE_double(blank_skip_thresh, 1.0,
"blank skip thresh for ctc wfst search, 1.0 means no skip");
DEFINE_double(length_penalty, 0.0, "length penalty ctc wfst search, will not"
DEFINE_double(length_penalty, 0.0,
"length penalty ctc wfst search, will not"
"apply on self-loop arc, for balancing the del/ins ratio, "
"suggest set to -3.0");
DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search");
Expand Down Expand Up @@ -130,7 +137,7 @@ std::shared_ptr<DecodeResource> InitDecodeResourceFromFlags() {
#else
LOG(FATAL) << "Please rebuild with cmake options '-DONNX=ON'.";
#endif
} else {
} else if (!FLAGS_model_path.empty()) {
#ifdef USE_TORCH
LOG(INFO) << "Reading torch model " << FLAGS_model_path;
TorchAsrModel::InitEngineThreads(FLAGS_num_threads);
Expand All @@ -140,6 +147,19 @@ std::shared_ptr<DecodeResource> InitDecodeResourceFromFlags() {
#else
LOG(FATAL) << "Please rebuild with cmake options '-DTORCH=ON'.";
#endif
} else if (!FLAGS_xpu_model_dir.empty()) {
#ifdef USE_XPU
LOG(INFO) << "Reading XPU WeNet model weight from " << FLAGS_xpu_model_dir;
auto model = std::make_shared<XPUAsrModel>();
model->SetEngineThreads(FLAGS_num_threads);
model->SetDeviceId(FLAGS_device_id);
model->Read(FLAGS_xpu_model_dir);
resource->model = model;
#else
LOG(FATAL) << "Please rebuild with cmake options '-DXPU=ON'.";
#endif
} else {
LOG(FATAL) << "Please set ONNX, TORCH or XPU model path!!!";
}

LOG(INFO) << "Reading unit table " << FLAGS_unit_path;
Expand Down
2 changes: 2 additions & 0 deletions runtime/kunlun/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
build/
fc_base/
68 changes: 68 additions & 0 deletions runtime/kunlun/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

project(wenet VERSION 0.1)

option(CXX11_ABI "whether to use CXX11_ABI libtorch" OFF)
option(FST_HAVE_BIN "whether to build fst binaries" OFF)
option(BUILD_TESTING "whether to build unit test" OFF)
option(GRPC "whether to build with gRPC" OFF)
# TODO(Binbin Zhang): Change websocket to OFF since it depends on boost
# which is a very big library
option(WEBSOCKET "whether to build with websocket" OFF)
option(XPU "whether to build with XPU" ON)

set(CMAKE_VERBOSE_MAKEFILE OFF)

include(FetchContent)
include(ExternalProject)
set(FETCHCONTENT_QUIET OFF)
get_filename_component(fc_base "fc_base" REALPATH BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_base})

list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -pthread -fPIC")

# Include all dependency
include(openfst)
if(XPU)
include(xpu)
# compile xpu_conformer.a and conformer_test
add_subdirectory(xpu)
endif()
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/kaldi
)

# Build all libraries
add_subdirectory(utils)
if(NOT MSVC)
add_dependencies(utils openfst)
endif()
add_subdirectory(frontend)
add_subdirectory(post_processor)
add_subdirectory(kaldi) # kaldi: wfst based decoder
add_subdirectory(decoder)
add_subdirectory(api)

# Optionally, you can build with websocket
if(WEBSOCKET)
include(boost)
add_subdirectory(websocket)
endif()

# Optionally, you can build with gRPC
if(GRPC)
include(grpc)
add_subdirectory(grpc)
endif()

# Build all bins
add_subdirectory(bin)

# Unit Test
if(BUILD_TESTING)
include(gtest)
add_subdirectory(test)
endif()
83 changes: 83 additions & 0 deletions runtime/kunlun/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# 在昆仑芯片上运行Wenet
## 介绍
下面的示例展示了如何在XPU上部署WeNet离线或在线的ASR模型。XPU是一种由昆仑芯100%自主研发的通用人工智能计算核心架构。

## 准备XPU运行环境

在开始之前,请确认您获得以下必须的环境。

XRE(XPU Runtime Environment):昆仑芯片的基础运行环境,包括芯片驱动程序、runtime api库、固件FW工具等功能模块。
XDNN(XPU Deep Neural Network Library):加速深度神经网络的昆仑芯片库,提供应用程序中使用的高性能DNN功能库。

如果您需要任何帮助,或是想要进一步了解昆仑芯片,请通过官方网址联系我们:
https://www.kunlunxin.com.cn/

## 操作步骤
- 第一步:构建,需要cmake 3.14及以上版本

``` sh
export CXX=${your_g++_path}
export CC=${your_gcc_path}
export XPU_API_PATH=${your_api_path}

# -r : release version; -d : debug version
bash ./compile.sh -r
```

- 第二步:测试,测试结果将在控制台输出

``` sh
## set KUNLUN XPU visible device
export XPU_VISIBLE_DEVICES=0
export XPUSIM_DEVICE_MODEL=KUNLUN2
## set logging level
export GLOG_logtostderr=1
export GLOG_v=3
## set speech wav and model/weight path
wav_path=${your_test_wav_path}
xpu_model_dir=${your_xpu_weight_dir}
units=${your_units.txt}
## executive command
./build/bin/decoder_main \
--chunk_size -1 \
--wav_path ${wav_path} \
--xpu_model_dir ${xpu_model_di} \
--unit_path ${units} \
--device_id 0 \
--nbest 3 2>&1 | tee log.txt
```

单条语音执行结果如下所示:

``` sh
XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded
I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/
I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/
I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: =======
I1027 06:06:23.832749 111767 xpu_asr_model.cc:66] subsampling_rate 4
I1027 06:06:23.832777 111767 xpu_asr_model.cc:67] right_context 6
I1027 06:06:23.832789 111767 xpu_asr_model.cc:68] sos 5538
I1027 06:06:23.832795 111767 xpu_asr_model.cc:69] eos 5538
I1027 06:06:23.832799 111767 xpu_asr_model.cc:70] is bidirectional decoder 1
I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict
I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418
I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418
I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0!
I1027 06:06:23.843616 111776 xpu_asr_model.cc:173] max_seqlen is 418
I1027 06:06:23.843619 111776 xpu_asr_model.cc:174] q_seqlen is 103
I1027 06:06:23.843623 111776 xpu_asr_model.cc:175] att_dim is 512
I1027 06:06:23.843626 111776 xpu_asr_model.cc:176] ctc_dim is 5538
I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms
I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
I1027 06:06:23.852537 111776 xpu_asr_model.cc:248] num_hyps is 3
I1027 06:06:23.852541 111776 xpu_asr_model.cc:249] beam_size is 3
I1027 06:06:23.852545 111776 xpu_asr_model.cc:250] new_bs is 3
I1027 06:06:23.852545 111776 xpu_asr_model.cc:251] max_hyps_len is 14
I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms.
I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况
I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况
I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms.
test 甚至出现交易几乎停滞的情况
I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms.
```
87 changes: 87 additions & 0 deletions runtime/kunlun/README_EN.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# WeNet running on KUNLUNXIN XPU device
## Introduction
The below example shows how to deploy WeNet offline and online ASR models on XPUs.
XPU is a core architecture 100% independently developed by KUNLUNXIN for general artificial intelligence computing.

## Setup environment for XPU device

Before the start, makesure you have these necessary environment

XRE(XPU Runtime Environment):The basic operating environment of the XPUs
includes functional modules such as chip drivers, runtime api library, and firmware tools.

XDNN(XPU Deep Neural Network Library): XPU library for accelerating deep neural networks, providing high-performance DNN function library used in applications.

If you would like to know more about XPUs or need any help, please contact us through the official website:

https://www.kunlunxin.com.cn/

## Instruction
- Step 1. Build, the build requires cmake 3.14 or above.

``` sh
export CXX=${your_g++_path}
export CC=${your_gcc_path}
export XPU_API_PATH=${your_api_path}

# -r : release version; -d : debug version
bash ./compile.sh -r
```

- Step 2. Testing, the result is shown in the console.

``` sh
## set KUNLUN XPU visible device
export XPU_VISIBLE_DEVICES=0
export XPUSIM_DEVICE_MODEL=KUNLUN2
## set logging level
export GLOG_logtostderr=1
export GLOG_v=3
## set speech wav and model/weight/units path
wav_path=${your_test_wav_path}
xpu_model_dir=${your_xpu_weight_dir}
units=${your_units.txt}
## executive command
./build/bin/decoder_main \
--chunk_size -1 \
--wav_path $wav_path \
--xpu_model_dir $xpu_model_dir \
--unit_path $units \
--device_id 0 \
--nbest 3 2>&1 | tee log.txt
```

A typical output result is as following:

``` sh
XPURT /docker_workspace/icode-api/baidu/xpu/api/../runtime/output/so/libxpurt.so loaded
I1027 06:06:21.933722 111767 params.h:152] Reading XPU WeNet model weight from /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/
I1027 06:06:21.934103 111767 xpu_asr_model.cc:46] XPU weight_dir is: /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data//model_weights/
I1027 06:06:23.832731 111767 xpu_asr_model.cc:65] ======= XPU Kunlun Model Info: =======
I1027 06:06:23.832749 111767 xpu_asr_model.cc:66] subsampling_rate 4
I1027 06:06:23.832777 111767 xpu_asr_model.cc:67] right_context 6
I1027 06:06:23.832789 111767 xpu_asr_model.cc:68] sos 5538
I1027 06:06:23.832795 111767 xpu_asr_model.cc:69] eos 5538
I1027 06:06:23.832799 111767 xpu_asr_model.cc:70] is bidirectional decoder 1
I1027 06:06:23.832804 111767 params.h:165] Reading unit table /docker_workspace/icode-api/baidu/xpu/api/example/wenet-conformer/all_data/dict
I1027 06:06:23.843475 111776 decoder_main.cc:54] num frames 418
I1027 06:06:23.843521 111776 asr_decoder.cc:104] Required 2147483647 get 418
I1027 06:06:23.843528 111776 xpu_asr_model.cc:116] Now Use XPU:0!
I1027 06:06:23.843616 111776 xpu_asr_model.cc:173] max_seqlen is 418
I1027 06:06:23.843619 111776 xpu_asr_model.cc:174] q_seqlen is 103
I1027 06:06:23.843623 111776 xpu_asr_model.cc:175] att_dim is 512
I1027 06:06:23.843626 111776 xpu_asr_model.cc:176] ctc_dim is 5538
I1027 06:06:23.852284 111776 asr_decoder.cc:113] forward takes 7 ms, search takes 1 ms
I1027 06:06:23.852383 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
I1027 06:06:23.852530 111776 asr_decoder.cc:194] Partial CTC result 甚至出现交易几乎停滞的情况
I1027 06:06:23.852537 111776 xpu_asr_model.cc:248] num_hyps is 3
I1027 06:06:23.852541 111776 xpu_asr_model.cc:249] beam_size is 3
I1027 06:06:23.852545 111776 xpu_asr_model.cc:250] new_bs is 3
I1027 06:06:23.852545 111776 xpu_asr_model.cc:251] max_hyps_len is 14
I1027 06:06:23.853902 111776 asr_decoder.cc:84] Rescoring cost latency: 1ms.
I1027 06:06:23.853911 111776 decoder_main.cc:72] Partial result: 甚至出现交易几乎停滞的情况
I1027 06:06:23.853914 111776 decoder_main.cc:104] test Final result: 甚至出现交易几乎停滞的情况
I1027 06:06:23.853924 111776 decoder_main.cc:105] Decoded 4203ms audio taken 10ms.
test 甚至出现交易几乎停滞的情况
I1027 06:06:23.853984 111767 decoder_main.cc:180] Total: decoded 4203ms audio taken 10ms.
```
1 change: 1 addition & 0 deletions runtime/kunlun/api
1 change: 1 addition & 0 deletions runtime/kunlun/bin
1 change: 1 addition & 0 deletions runtime/kunlun/cmake
Loading

0 comments on commit 70b5b69

Please sign in to comment.