From 198a38182cb5751c1722c0a51f3c9067d25812c9 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 9 Dec 2024 17:41:16 +0800 Subject: [PATCH 1/2] Support speaker embedding extractor API for HarmonyOS --- sherpa-onnx/c-api/c-api.cc | 33 +++++++++++++++- sherpa-onnx/c-api/c-api.h | 5 +++ sherpa-onnx/csrc/offline-tts-vits-impl.h | 4 +- ...speaker-embedding-extractor-general-impl.h | 16 ++++++-- .../csrc/speaker-embedding-extractor-impl.cc | 32 ++++++++++++++- .../csrc/speaker-embedding-extractor-impl.h | 10 +---- .../csrc/speaker-embedding-extractor-model.cc | 39 ++++++++++++++++--- .../csrc/speaker-embedding-extractor-model.h | 10 +---- .../speaker-embedding-extractor-nemo-impl.h | 16 ++++++-- .../speaker-embedding-extractor-nemo-model.cc | 38 +++++++++++++++--- .../speaker-embedding-extractor-nemo-model.h | 10 +---- .../csrc/speaker-embedding-extractor.cc | 24 ++++++++++-- .../csrc/speaker-embedding-extractor.h | 10 +---- 13 files changed, 188 insertions(+), 59 deletions(-) diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index e25097809..7748b9fee 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -1328,8 +1328,8 @@ struct SherpaOnnxSpeakerEmbeddingExtractor { std::unique_ptr impl; }; -const SherpaOnnxSpeakerEmbeddingExtractor * -SherpaOnnxCreateSpeakerEmbeddingExtractor( +static sherpa_onnx::SpeakerEmbeddingExtractorConfig +GetSpeakerEmbeddingExtractorConfig( const SherpaOnnxSpeakerEmbeddingExtractorConfig *config) { sherpa_onnx::SpeakerEmbeddingExtractorConfig c; c.model = SHERPA_ONNX_OR(config->model, ""); @@ -1342,9 +1342,21 @@ SherpaOnnxCreateSpeakerEmbeddingExtractor( } if (config->debug) { +#if __OHOS__ + SHERPA_ONNX_LOGE("%{public}s\n", c.ToString().c_str()); +#else SHERPA_ONNX_LOGE("%s\n", c.ToString().c_str()); +#endif } + return c; +} + +const SherpaOnnxSpeakerEmbeddingExtractor * +SherpaOnnxCreateSpeakerEmbeddingExtractor( + const SherpaOnnxSpeakerEmbeddingExtractorConfig *config) { + auto c = GetSpeakerEmbeddingExtractorConfig(config); + if (!c.Validate()) { SHERPA_ONNX_LOGE("Errors in config!"); return nullptr; @@ -1983,6 +1995,23 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetectorOHOS( return p; } +const SherpaOnnxSpeakerEmbeddingExtractor * +SherpaOnnxCreateSpeakerEmbeddingExtractorOHOS( + const SherpaOnnxSpeakerEmbeddingExtractorConfig *config, + NativeResourceManager *mgr) { + if (!mgr) { + return SherpaOnnxCreateSpeakerEmbeddingExtractor(config); + } + + auto c = GetSpeakerEmbeddingExtractorConfig(config); + + auto p = new SherpaOnnxSpeakerEmbeddingExtractor; + + p->impl = std::make_unique(mgr, c); + + return p; +} + #if SHERPA_ONNX_ENABLE_TTS == 1 SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS( const SherpaOnnxOfflineTtsConfig *config, NativeResourceManager *mgr) { diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index fde626e99..111aae779 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -1572,6 +1572,11 @@ SherpaOnnxCreateVoiceActivityDetectorOHOS( SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS( const SherpaOnnxOfflineTtsConfig *config, NativeResourceManager *mgr); + +SHERPA_ONNX_API const SherpaOnnxSpeakerEmbeddingExtractor * +SherpaOnnxCreateSpeakerEmbeddingExtractorOHOS( + const SherpaOnnxSpeakerEmbeddingExtractorConfig *config, + NativeResourceManager *mgr); #endif #if defined(__GNUC__) diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index 5ef79f69b..560576357 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -62,9 +62,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { for (const auto &f : files) { if (config.model.debug) { #if __OHOS__ - SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); -#else SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str()); +#else + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); #endif } std::unique_ptr> reader( diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-general-impl.h b/sherpa-onnx/csrc/speaker-embedding-extractor-general-impl.h index ca384c855..8a884e85d 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-general-impl.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-general-impl.h @@ -22,11 +22,10 @@ class SpeakerEmbeddingExtractorGeneralImpl const SpeakerEmbeddingExtractorConfig &config) : model_(config) {} -#if __ANDROID_API__ >= 9 + template SpeakerEmbeddingExtractorGeneralImpl( - AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config) + Manager *mgr, const SpeakerEmbeddingExtractorConfig &config) : model_(mgr, config) {} -#endif int32_t Dim() const override { return model_.GetMetaData().output_dim; } @@ -46,9 +45,15 @@ class SpeakerEmbeddingExtractorGeneralImpl std::vector Compute(OnlineStream *s) const override { int32_t num_frames = s->NumFramesReady() - s->GetNumProcessedFrames(); if (num_frames <= 0) { +#if __OHOS__ + SHERPA_ONNX_LOGE( + "Please make sure IsReady(s) returns true. num_frames: %{public}d", + num_frames); +#else SHERPA_ONNX_LOGE( "Please make sure IsReady(s) returns true. num_frames: %d", num_frames); +#endif return {}; } @@ -64,8 +69,13 @@ class SpeakerEmbeddingExtractorGeneralImpl if (meta_data.feature_normalize_type == "global-mean") { SubtractGlobalMean(features.data(), num_frames, feat_dim); } else { +#if __OHOS__ + SHERPA_ONNX_LOGE("Unsupported feature_normalize_type: %{public}s", + meta_data.feature_normalize_type.c_str()); +#else SHERPA_ONNX_LOGE("Unsupported feature_normalize_type: %s", meta_data.feature_normalize_type.c_str()); +#endif exit(-1); } } diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc b/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc index b9591d624..650b1576a 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc @@ -3,6 +3,15 @@ // Copyright (c) 2024 Xiaomi Corporation #include "sherpa-onnx/csrc/speaker-embedding-extractor-impl.h" +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#if __OHOS__ +#include "rawfile/raw_file_manager.h" +#endif + #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/onnx-utils.h" #include "sherpa-onnx/csrc/speaker-embedding-extractor-general-impl.h" @@ -35,7 +44,11 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, if (debug) { std::ostringstream os; PrintModelMetadata(os, meta_data); +#if __OHOS__ + SHERPA_ONNX_LOGE("%{public}s", os.str().c_str()); +#else SHERPA_ONNX_LOGE("%s", os.str().c_str()); +#endif } Ort::AllocatorWithDefaultOptions allocator; @@ -59,7 +72,11 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, } else if (model_type == "nemo") { return ModelType::kNeMo; } else { +#if __OHOS__ + SHERPA_ONNX_LOGE("Unsupported model_type: %{public}s", model_type.c_str()); +#else SHERPA_ONNX_LOGE("Unsupported model_type: %s", model_type.c_str()); +#endif return ModelType::kUnknown; } } @@ -91,10 +108,10 @@ SpeakerEmbeddingExtractorImpl::Create( return nullptr; } -#if __ANDROID_API__ >= 9 +template std::unique_ptr SpeakerEmbeddingExtractorImpl::Create( - AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config) { + Manager *mgr, const SpeakerEmbeddingExtractorConfig &config) { ModelType model_type = ModelType::kUnknown; { @@ -120,6 +137,17 @@ SpeakerEmbeddingExtractorImpl::Create( // unreachable code return nullptr; } + +#if __ANDROID_API__ >= 9 +template std::unique_ptr +SpeakerEmbeddingExtractorImpl::Create( + AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config); +#endif + +#if __OHOS__ +template std::unique_ptr +SpeakerEmbeddingExtractorImpl::Create( + NativeResourceManager *mgr, const SpeakerEmbeddingExtractorConfig &config); #endif } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-impl.h b/sherpa-onnx/csrc/speaker-embedding-extractor-impl.h index 9465ab94e..6299dce4b 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-impl.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-impl.h @@ -9,11 +9,6 @@ #include #include -#if __ANDROID_API__ >= 9 -#include "android/asset_manager.h" -#include "android/asset_manager_jni.h" -#endif - #include "sherpa-onnx/csrc/speaker-embedding-extractor.h" namespace sherpa_onnx { @@ -25,10 +20,9 @@ class SpeakerEmbeddingExtractorImpl { static std::unique_ptr Create( const SpeakerEmbeddingExtractorConfig &config); -#if __ANDROID_API__ >= 9 + template static std::unique_ptr Create( - AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config); -#endif + Manager *mgr, const SpeakerEmbeddingExtractorConfig &config); virtual int32_t Dim() const = 0; diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-model.cc b/sherpa-onnx/csrc/speaker-embedding-extractor-model.cc index e5fa26eed..48d7f19e0 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-model.cc +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-model.cc @@ -8,6 +8,15 @@ #include #include +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#if __OHOS__ +#include "rawfile/raw_file_manager.h" +#endif + #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/onnx-utils.h" #include "sherpa-onnx/csrc/session.h" @@ -28,8 +37,8 @@ class SpeakerEmbeddingExtractorModel::Impl { } } -#if __ANDROID_API__ >= 9 - Impl(AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config) + template + Impl(Manager *mgr, const SpeakerEmbeddingExtractorConfig &config) : config_(config), env_(ORT_LOGGING_LEVEL_ERROR), sess_opts_(GetSessionOptions(config)), @@ -39,7 +48,6 @@ class SpeakerEmbeddingExtractorModel::Impl { Init(buf.data(), buf.size()); } } -#endif Ort::Value Compute(Ort::Value x) const { std::array inputs = {std::move(x)}; @@ -68,7 +76,11 @@ class SpeakerEmbeddingExtractorModel::Impl { if (config_.debug) { std::ostringstream os; PrintModelMetadata(os, meta_data); +#if __OHOS__ + SHERPA_ONNX_LOGE("%{public}s", os.str().c_str()); +#else SHERPA_ONNX_LOGE("%s", os.str().c_str()); +#endif } Ort::AllocatorWithDefaultOptions allocator; // used in the macro below @@ -84,8 +96,14 @@ class SpeakerEmbeddingExtractorModel::Impl { std::string framework; SHERPA_ONNX_READ_META_DATA_STR(framework, "framework"); if (framework != "wespeaker" && framework != "3d-speaker") { +#if __OHOS__ + SHERPA_ONNX_LOGE( + "Expect a wespeaker or a 3d-speaker model, given: %{public}s", + framework.c_str()); +#else SHERPA_ONNX_LOGE("Expect a wespeaker or a 3d-speaker model, given: %s", framework.c_str()); +#endif exit(-1); } } @@ -111,11 +129,10 @@ SpeakerEmbeddingExtractorModel::SpeakerEmbeddingExtractorModel( const SpeakerEmbeddingExtractorConfig &config) : impl_(std::make_unique(config)) {} -#if __ANDROID_API__ >= 9 +template SpeakerEmbeddingExtractorModel::SpeakerEmbeddingExtractorModel( - AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config) + Manager *mgr, const SpeakerEmbeddingExtractorConfig &config) : impl_(std::make_unique(mgr, config)) {} -#endif SpeakerEmbeddingExtractorModel::~SpeakerEmbeddingExtractorModel() = default; @@ -128,4 +145,14 @@ Ort::Value SpeakerEmbeddingExtractorModel::Compute(Ort::Value x) const { return impl_->Compute(std::move(x)); } +#if __ANDROID_API__ >= 9 +template SpeakerEmbeddingExtractorModel::SpeakerEmbeddingExtractorModel( + AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config); +#endif + +#if __OHOS__ +template SpeakerEmbeddingExtractorModel::SpeakerEmbeddingExtractorModel( + NativeResourceManager *mgr, const SpeakerEmbeddingExtractorConfig &config); +#endif + } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-model.h b/sherpa-onnx/csrc/speaker-embedding-extractor-model.h index 83ef0cc0d..6c6bdd01c 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-model.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-model.h @@ -6,11 +6,6 @@ #include -#if __ANDROID_API__ >= 9 -#include "android/asset_manager.h" -#include "android/asset_manager_jni.h" -#endif - #include "onnxruntime_cxx_api.h" // NOLINT #include "sherpa-onnx/csrc/speaker-embedding-extractor-model-meta-data.h" #include "sherpa-onnx/csrc/speaker-embedding-extractor.h" @@ -22,10 +17,9 @@ class SpeakerEmbeddingExtractorModel { explicit SpeakerEmbeddingExtractorModel( const SpeakerEmbeddingExtractorConfig &config); -#if __ANDROID_API__ >= 9 - SpeakerEmbeddingExtractorModel(AAssetManager *mgr, + template + SpeakerEmbeddingExtractorModel(Manager *mgr, const SpeakerEmbeddingExtractorConfig &config); -#endif ~SpeakerEmbeddingExtractorModel(); diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h index 7e0883085..e4bacbb05 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h @@ -22,11 +22,10 @@ class SpeakerEmbeddingExtractorNeMoImpl : public SpeakerEmbeddingExtractorImpl { const SpeakerEmbeddingExtractorConfig &config) : model_(config) {} -#if __ANDROID_API__ >= 9 + template SpeakerEmbeddingExtractorNeMoImpl( - AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config) + Manager *mgr, const SpeakerEmbeddingExtractorConfig &config) : model_(mgr, config) {} -#endif int32_t Dim() const override { return model_.GetMetaData().output_dim; } @@ -54,9 +53,15 @@ class SpeakerEmbeddingExtractorNeMoImpl : public SpeakerEmbeddingExtractorImpl { std::vector Compute(OnlineStream *s) const override { int32_t num_frames = s->NumFramesReady() - s->GetNumProcessedFrames(); if (num_frames <= 0) { +#if __OHOS__ + SHERPA_ONNX_LOGE( + "Please make sure IsReady(s) returns true. num_frames: %{public}d", + num_frames); +#else SHERPA_ONNX_LOGE( "Please make sure IsReady(s) returns true. num_frames: %d", num_frames); +#endif return {}; } @@ -72,8 +77,13 @@ class SpeakerEmbeddingExtractorNeMoImpl : public SpeakerEmbeddingExtractorImpl { if (meta_data.feature_normalize_type == "per_feature") { NormalizePerFeature(features.data(), num_frames, feat_dim); } else { +#if __OHOS__ SHERPA_ONNX_LOGE("Unsupported feature_normalize_type: %s", meta_data.feature_normalize_type.c_str()); +#else + SHERPA_ONNX_LOGE("Unsupported feature_normalize_type: %{public}s", + meta_data.feature_normalize_type.c_str()); +#endif exit(-1); } } diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model.cc b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model.cc index 1b60e2469..3983e1cb8 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model.cc +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model.cc @@ -8,6 +8,15 @@ #include #include +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#if __OHOS__ +#include "rawfile/raw_file_manager.h" +#endif + #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/onnx-utils.h" #include "sherpa-onnx/csrc/session.h" @@ -28,8 +37,8 @@ class SpeakerEmbeddingExtractorNeMoModel::Impl { } } -#if __ANDROID_API__ >= 9 - Impl(AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config) + template + Impl(Manager *mgr, const SpeakerEmbeddingExtractorConfig &config) : config_(config), env_(ORT_LOGGING_LEVEL_ERROR), sess_opts_(GetSessionOptions(config)), @@ -39,7 +48,6 @@ class SpeakerEmbeddingExtractorNeMoModel::Impl { Init(buf.data(), buf.size()); } } -#endif Ort::Value Compute(Ort::Value x, Ort::Value x_lens) const { std::array inputs = {std::move(x), std::move(x_lens)}; @@ -73,7 +81,11 @@ class SpeakerEmbeddingExtractorNeMoModel::Impl { if (config_.debug) { std::ostringstream os; PrintModelMetadata(os, meta_data); +#if __OHOS__ + SHERPA_ONNX_LOGE("%{public}s", os.str().c_str()); +#else SHERPA_ONNX_LOGE("%s", os.str().c_str()); +#endif } Ort::AllocatorWithDefaultOptions allocator; // used in the macro below @@ -93,7 +105,12 @@ class SpeakerEmbeddingExtractorNeMoModel::Impl { std::string framework; SHERPA_ONNX_READ_META_DATA_STR(framework, "framework"); if (framework != "nemo") { +#if __OHOS__ + SHERPA_ONNX_LOGE("Expect a NeMo model, given: %{public}s", + framework.c_str()); +#else SHERPA_ONNX_LOGE("Expect a NeMo model, given: %s", framework.c_str()); +#endif exit(-1); } } @@ -119,11 +136,10 @@ SpeakerEmbeddingExtractorNeMoModel::SpeakerEmbeddingExtractorNeMoModel( const SpeakerEmbeddingExtractorConfig &config) : impl_(std::make_unique(config)) {} -#if __ANDROID_API__ >= 9 +template SpeakerEmbeddingExtractorNeMoModel::SpeakerEmbeddingExtractorNeMoModel( - AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config) + Manager *mgr, const SpeakerEmbeddingExtractorConfig &config) : impl_(std::make_unique(mgr, config)) {} -#endif SpeakerEmbeddingExtractorNeMoModel::~SpeakerEmbeddingExtractorNeMoModel() = default; @@ -142,4 +158,14 @@ OrtAllocator *SpeakerEmbeddingExtractorNeMoModel::Allocator() const { return impl_->Allocator(); } +#if __ANDROID_API__ >= 9 +template SpeakerEmbeddingExtractorNeMoModel::SpeakerEmbeddingExtractorNeMoModel( + AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config); +#endif + +#if __OHOS__ +template SpeakerEmbeddingExtractorNeMoModel::SpeakerEmbeddingExtractorNeMoModel( + NativeResourceManager *mgr, const SpeakerEmbeddingExtractorConfig &config); +#endif + } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model.h b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model.h index af0623724..ed61ee8b5 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model.h @@ -6,11 +6,6 @@ #include -#if __ANDROID_API__ >= 9 -#include "android/asset_manager.h" -#include "android/asset_manager_jni.h" -#endif - #include "onnxruntime_cxx_api.h" // NOLINT #include "sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model-meta-data.h" #include "sherpa-onnx/csrc/speaker-embedding-extractor.h" @@ -22,10 +17,9 @@ class SpeakerEmbeddingExtractorNeMoModel { explicit SpeakerEmbeddingExtractorNeMoModel( const SpeakerEmbeddingExtractorConfig &config); -#if __ANDROID_API__ >= 9 + template SpeakerEmbeddingExtractorNeMoModel( - AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config); -#endif + Manager *mgr, const SpeakerEmbeddingExtractorConfig &config); ~SpeakerEmbeddingExtractorNeMoModel(); diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor.cc b/sherpa-onnx/csrc/speaker-embedding-extractor.cc index d90b0b1e0..5d52fb2f9 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor.cc +++ b/sherpa-onnx/csrc/speaker-embedding-extractor.cc @@ -6,6 +6,15 @@ #include +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#if __OHOS__ +#include "rawfile/raw_file_manager.h" +#endif + #include "sherpa-onnx/csrc/file-utils.h" #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/speaker-embedding-extractor-impl.h" @@ -55,11 +64,10 @@ SpeakerEmbeddingExtractor::SpeakerEmbeddingExtractor( const SpeakerEmbeddingExtractorConfig &config) : impl_(SpeakerEmbeddingExtractorImpl::Create(config)) {} -#if __ANDROID_API__ >= 9 +template SpeakerEmbeddingExtractor::SpeakerEmbeddingExtractor( - AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config) + Manager *mgr, const SpeakerEmbeddingExtractorConfig &config) : impl_(SpeakerEmbeddingExtractorImpl::Create(mgr, config)) {} -#endif SpeakerEmbeddingExtractor::~SpeakerEmbeddingExtractor() = default; @@ -77,4 +85,14 @@ std::vector SpeakerEmbeddingExtractor::Compute(OnlineStream *s) const { return impl_->Compute(s); } +#if __ANDROID_API__ >= 9 +template SpeakerEmbeddingExtractor::SpeakerEmbeddingExtractor( + AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config); +#endif + +#if __OHOS__ +template SpeakerEmbeddingExtractor::SpeakerEmbeddingExtractor( + NativeResourceManager *mgr, const SpeakerEmbeddingExtractorConfig &config); +#endif + } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor.h b/sherpa-onnx/csrc/speaker-embedding-extractor.h index 4d9783c85..068e6b8d3 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor.h @@ -9,11 +9,6 @@ #include #include -#if __ANDROID_API__ >= 9 -#include "android/asset_manager.h" -#include "android/asset_manager_jni.h" -#endif - #include "sherpa-onnx/csrc/online-stream.h" #include "sherpa-onnx/csrc/parse-options.h" @@ -45,10 +40,9 @@ class SpeakerEmbeddingExtractor { explicit SpeakerEmbeddingExtractor( const SpeakerEmbeddingExtractorConfig &config); -#if __ANDROID_API__ >= 9 - SpeakerEmbeddingExtractor(AAssetManager *mgr, + template + SpeakerEmbeddingExtractor(Manager *mgr, const SpeakerEmbeddingExtractorConfig &config); -#endif ~SpeakerEmbeddingExtractor(); From 8fb3d00c9b890a84a4d023cbb984ffa57ed5e229 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 9 Dec 2024 19:20:27 +0800 Subject: [PATCH 2/2] Add ArkTS API for speaker identification --- .gitignore | 2 + .../SherpaOnnxHar/sherpa_onnx/Index.ets | 6 + .../src/main/cpp/speaker-identification.cc | 22 +++ .../main/cpp/types/libsherpa_onnx/Index.d.ts | 15 ++ .../main/ets/components/NonStreamingTts.ets | 2 +- .../ets/components/SpeakerIdentification.ets | 139 ++++++++++++++++++ .../speaker-embedding-extractor-nemo-impl.h | 5 +- 7 files changed, 188 insertions(+), 3 deletions(-) create mode 100644 harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/SpeakerIdentification.ets diff --git a/.gitignore b/.gitignore index 18178f5fa..cfb6fa57c 100644 --- a/.gitignore +++ b/.gitignore @@ -123,3 +123,5 @@ sherpa-onnx-online-punct-en-2024-08-06 sherpa-onnx-pyannote-segmentation-3-0 sherpa-onnx-moonshine-tiny-en-int8 sherpa-onnx-moonshine-base-en-int8 +harmony-os/SherpaOnnxHar/sherpa_onnx/LICENSE +harmony-os/SherpaOnnxHar/sherpa_onnx/CHANGELOG.md diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets b/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets index 14dff071e..5132df5f1 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets @@ -51,3 +51,9 @@ export { TtsOutput, TtsInput, } from './src/main/ets/components/NonStreamingTts'; + +export { + SpeakerEmbeddingExtractorConfig, + SpeakerEmbeddingExtractor, + SpeakerEmbeddingManager, +} from './src/main/ets/components/SpeakerIdentification'; diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/speaker-identification.cc b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/speaker-identification.cc index a08a6ed66..21c9a89e7 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/speaker-identification.cc +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/speaker-identification.cc @@ -11,6 +11,17 @@ static Napi::External CreateSpeakerEmbeddingExtractorWrapper(const Napi::CallbackInfo &info) { Napi::Env env = info.Env(); + +#if __OHOS__ + if (info.Length() != 2) { + std::ostringstream os; + os << "Expect only 2 arguments. Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } +#else if (info.Length() != 1) { std::ostringstream os; os << "Expect only 1 argument. Given: " << info.Length(); @@ -19,6 +30,7 @@ CreateSpeakerEmbeddingExtractorWrapper(const Napi::CallbackInfo &info) { return {}; } +#endif if (!info[0].IsObject()) { Napi::TypeError::New(env, "You should pass an object as the only argument.") @@ -46,8 +58,18 @@ CreateSpeakerEmbeddingExtractorWrapper(const Napi::CallbackInfo &info) { SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider); +#if __OHOS__ + std::unique_ptr + mgr(OH_ResourceManager_InitNativeResourceManager(env, info[1]), + &OH_ResourceManager_ReleaseNativeResourceManager); + + const SherpaOnnxSpeakerEmbeddingExtractor *extractor = + SherpaOnnxCreateSpeakerEmbeddingExtractorOHOS(&c, mgr.get()); +#else const SherpaOnnxSpeakerEmbeddingExtractor *extractor = SherpaOnnxCreateSpeakerEmbeddingExtractor(&c); +#endif if (c.model) { delete[] c.model; diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/types/libsherpa_onnx/Index.d.ts b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/types/libsherpa_onnx/Index.d.ts index 057d5af25..d2b6d6ea4 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/types/libsherpa_onnx/Index.d.ts +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/types/libsherpa_onnx/Index.d.ts @@ -47,3 +47,18 @@ export type TtsOutput = { export const offlineTtsGenerate: (handle: object, input: object) => TtsOutput; export const offlineTtsGenerateAsync: (handle: object, input: object) => Promise; + +export const createSpeakerEmbeddingExtractor: (config: object, mgr?: object) => object; +export const speakerEmbeddingExtractorDim: (handle: object) => number; +export const speakerEmbeddingExtractorCreateStream: (handle: object) => object; +export const speakerEmbeddingExtractorIsReady: (handle: object, stream: object) => boolean; +export const speakerEmbeddingExtractorComputeEmbedding: (handle: object, stream: object, enableExternalBuffer: boolean) => Float32Array; +export const createSpeakerEmbeddingManager: (dim: number) => object; +export const speakerEmbeddingManagerAdd: (handle: object, speaker: {name: string, v: Float32Array}) => boolean; +export const speakerEmbeddingManagerAddListFlattened: (handle: object, speaker: {name: string, vv: Float32Array, n: number}) => boolean; +export const speakerEmbeddingManagerRemove: (handle: object, name: string) => boolean; +export const speakerEmbeddingManagerSearch: (handle: object, obj: {v: Float32Array, threshold: number}) => string; +export const speakerEmbeddingManagerVerify: (handle: object, obj: {name: string, v: Float32Array, threshold: number}) => boolean; +export const speakerEmbeddingManagerContains: (handle: object, name: string) => boolean; +export const speakerEmbeddingManagerNumSpeakers: (handle: object) => number; +export const speakerEmbeddingManagerGetAllSpeakers: (handle: object) => Array; diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets index a60a0e748..556877489 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets @@ -4,7 +4,7 @@ import { getOfflineTtsSampleRate, offlineTtsGenerate, offlineTtsGenerateAsync, -} from "libsherpa_onnx.so"; +} from 'libsherpa_onnx.so'; export class OfflineTtsVitsModelConfig { public model: string = ''; diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/SpeakerIdentification.ets b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/SpeakerIdentification.ets new file mode 100644 index 000000000..e490ab15f --- /dev/null +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/SpeakerIdentification.ets @@ -0,0 +1,139 @@ +import { + createSpeakerEmbeddingExtractor, + createSpeakerEmbeddingManager, + speakerEmbeddingExtractorComputeEmbedding, + speakerEmbeddingExtractorCreateStream, + speakerEmbeddingExtractorDim, + speakerEmbeddingExtractorIsReady, + speakerEmbeddingManagerAdd, + speakerEmbeddingManagerAddListFlattened, + speakerEmbeddingManagerContains, + speakerEmbeddingManagerGetAllSpeakers, + speakerEmbeddingManagerNumSpeakers, + speakerEmbeddingManagerRemove, + speakerEmbeddingManagerSearch, + speakerEmbeddingManagerVerify +} from 'libsherpa_onnx.so'; +import { OnlineStream } from './StreamingAsr'; + +export class SpeakerEmbeddingExtractorConfig { + public model: string = ''; + public numThreads: number = 1; + public debug: boolean = false; + public provider: string = 'cpu'; +} + +export class SpeakerEmbeddingExtractor { + public config: SpeakerEmbeddingExtractorConfig = new SpeakerEmbeddingExtractorConfig(); + public dim: number; + private handle: object; + + constructor(config: SpeakerEmbeddingExtractorConfig, mgr?: object) { + this.handle = createSpeakerEmbeddingExtractor(config, mgr); + this.config = config; + this.dim = speakerEmbeddingExtractorDim(this.handle); + } + + createStream(): OnlineStream { + return new OnlineStream( + speakerEmbeddingExtractorCreateStream(this.handle)); + } + + isReady(stream: OnlineStream): boolean { + return speakerEmbeddingExtractorIsReady(this.handle, stream.handle); + } + + compute(stream: OnlineStream, enableExternalBuffer: boolean = true): Float32Array { + return speakerEmbeddingExtractorComputeEmbedding( + this.handle, stream.handle, enableExternalBuffer); + } +} + +function flatten(arrayList: Float32Array[]): Float32Array { + let n = 0; + for (let i = 0; i < arrayList.length; ++i) { + n += arrayList[i].length; + } + let ans = new Float32Array(n); + + let offset = 0; + for (let i = 0; i < arrayList.length; ++i) { + ans.set(arrayList[i], offset); + offset += arrayList[i].length; + } + return ans; +} + +interface SpeakerNameWithEmbedding { + name: string; + v: Float32Array; +} + +interface SpeakerNameWithEmbeddingList { + name: string; + v: Float32Array[]; +} + +interface SpeakerNameWithEmbeddingN { + name: string; + vv: Float32Array; + n: number; +} + +interface EmbeddingWithThreshold { + v: Float32Array; + threshold: number; +} + +interface SpeakerNameEmbeddingThreshold { + name: string; + v: Float32Array; + threshold: number; +} + +export class SpeakerEmbeddingManager { + public dim: number; + private handle: object; + + constructor(dim: number) { + this.handle = createSpeakerEmbeddingManager(dim); + this.dim = dim; + } + + add(speaker: SpeakerNameWithEmbedding): boolean { + return speakerEmbeddingManagerAdd(this.handle, speaker); + } + + addMulti(speaker: SpeakerNameWithEmbeddingList): boolean { + const c: SpeakerNameWithEmbeddingN = { + name: speaker.name, + vv: flatten(speaker.v), + n: speaker.v.length, + }; + return speakerEmbeddingManagerAddListFlattened(this.handle, c); + } + + remove(name: string): boolean { + return speakerEmbeddingManagerRemove(this.handle, name); + } + + search(obj: EmbeddingWithThreshold): string { + return speakerEmbeddingManagerSearch(this.handle, obj); + } + + verify(obj: SpeakerNameEmbeddingThreshold): boolean { + return speakerEmbeddingManagerVerify(this.handle, obj); + } + + contains(name: string): boolean { + return speakerEmbeddingManagerContains(this.handle, name); + } + + getNumSpeakers(): number { + return speakerEmbeddingManagerNumSpeakers(this.handle); + } + + getAllSpeakerNames(): string[] { + return speakerEmbeddingManagerGetAllSpeakers(this.handle); + } +} \ No newline at end of file diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h index e4bacbb05..ec1c44d68 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-nemo-impl.h @@ -78,10 +78,11 @@ class SpeakerEmbeddingExtractorNeMoImpl : public SpeakerEmbeddingExtractorImpl { NormalizePerFeature(features.data(), num_frames, feat_dim); } else { #if __OHOS__ - SHERPA_ONNX_LOGE("Unsupported feature_normalize_type: %s", + SHERPA_ONNX_LOGE("Unsupported feature_normalize_type: %{public}s", meta_data.feature_normalize_type.c_str()); #else - SHERPA_ONNX_LOGE("Unsupported feature_normalize_type: %{public}s", + + SHERPA_ONNX_LOGE("Unsupported feature_normalize_type: %s", meta_data.feature_normalize_type.c_str()); #endif exit(-1);