From 2bfe0ebc0f966f556f6c1cbb923c35bff5889001 Mon Sep 17 00:00:00 2001 From: sandrohanea <40202887+sandrohanea@users.noreply.github.com> Date: Wed, 8 Feb 2023 08:01:47 +0100 Subject: [PATCH] whisper : fixed Beam Search Strategy and exposed whisper_pcm_to_mel_phase_vocoder (#474) Co-authored-by: Sandro Hanea --- whisper.cpp | 4 ++-- whisper.h | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 35d42f894f3..aebb4813f08 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2905,7 +2905,7 @@ const char * whisper_print_system_info(void) { struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) { struct whisper_full_params result = { - /*.strategy =*/ WHISPER_SAMPLING_GREEDY, + /*.strategy =*/ strategy, /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()), /*.n_max_text_ctx =*/ 16384, @@ -3829,7 +3829,7 @@ int whisper_full( auto & cur = beam_candidates[cur_c++]; - while (beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) { + while (beam_candidates.size() > cur_c && beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) { ++cur_c; } diff --git a/whisper.h b/whisper.h index 72331e6abd4..786d67d9cb4 100644 --- a/whisper.h +++ b/whisper.h @@ -113,6 +113,16 @@ extern "C" { int n_samples, int n_threads); + // Convert RAW PCM audio to log mel spectrogram, applying a Phase Vocoder to speed up the audio x2. + // The resulting spectrogram is stored inside the provided whisper context. + // Returns 0 on success + WHISPER_API int whisper_pcm_to_mel_phase_vocoder( + struct whisper_context* ctx, + const float* samples, + int n_samples, + int n_threads); + + // This can be used to set a custom log mel spectrogram inside the provided whisper context. // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram. // n_mel must be 80