Skip to content

Commit

Permalink
real real-time transcription(real-time subtitle) with English online-…
Browse files Browse the repository at this point in the history
…TV on Xiaomi 14 at the first time but bug-fix is still required
  • Loading branch information
zhouwg committed Mar 20, 2024
1 parent 8b14792 commit 4cd35dd
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ public void initGlobal() {
CDELibraryLoader.load("whispercpp");
CDELog.d(TAG, "cpu core counts:" + whispercpp.get_cpu_core_counts());
CDELog.j(TAG, "asr mode: " + mSettings.getASRMode());
CDELog.j(TAG, "thread counts:" + mSettings.getASRThreadCounts());
if ((CDEUtils.ASR_MODE_NORMAL == mSettings.getASRMode()) || (CDEUtils.ASR_MODE_TRANSCRIPTION_RECORD == mSettings.getASRMode())) {
result = whispercpp.asr_init(modelPath, mSettings.getASRThreadCounts(), WHISPER_ASR_MODE_NORMAL);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1575,6 +1575,7 @@ private void onASRStart(int asrMode) {
return;
} else {
CDELog.j(TAG, "ASR with GGML model file:" + file.getAbsolutePath());
CDELog.j(TAG, "thread counts:" + mSettings.getASRThreadCounts());
}

if (CDEUtils.getASRSubsystemInit()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,37 +120,18 @@ public void onPause() {
@Override
public void onSharedPreferenceChanged(SharedPreferences sharedPreferences, String key) {
CDELog.j(TAG, "key : " + key);
if (key.contains("pref.asrmode")) {
CDELog.j(TAG, "asrmode: " + mSettings.getASRMode());
CDELog.j(TAG, "asrthreadCounts " + mSettings.getASRThreadCounts());
CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
String modelPath = CDEUtils.getDataPath() + "ggml-" + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()) + ".bin";
CDELog.j(TAG, "modelPath:" + modelPath);
CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts() + 1, mSettings.getASRMode());
}

if (key.contains("pref.asrthreadcounts")) {
CDELog.j(TAG, "asrmode: " + mSettings.getASRMode());
CDELog.j(TAG, "asrthreadCounts " + mSettings.getASRThreadCounts() + 1);
CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
String modelPath = CDEUtils.getDataPath() + "ggml-" + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()) + ".bin";
CDELog.j(TAG, "modelPath:" + modelPath);
CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts() + 1, mSettings.getASRMode());
}


if (key.contains("pref.ggmlmodel")) {
CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
if (
(key.contains("pref.asrmode"))
|| (key.contains("pref.asrthreadcounts"))
|| (key.contains("pref.ggmlmodel"))
) {
CDELog.j(TAG, "asrmode: " + mSettings.getASRMode());
CDELog.j(TAG, "asrthreadCounts " + mSettings.getASRThreadCounts());
CDELog.j(TAG, "GGML mode: " + mSettings.getGGMLMode());
CDELog.j(TAG, "GGML mode name: " + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()));
String modelPath = CDEUtils.getDataPath() + "ggml-" + CDEUtils.getGGMLModeString(mSettings.getGGMLMode()) + ".bin";
CDELog.j(TAG, "modelPath:" + modelPath);
CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts() + 1, mSettings.getASRMode());
CDEUtils.setASRConfig("whispercpp", modelPath, mSettings.getASRThreadCounts(), mSettings.getASRMode());
}
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,12 @@ public int getASRMode() {

public int getASRThreadCounts() {
String key = mAppContext.getString(R.string.pref_key_asrthreadcounts);
String value = mSharedPreferences.getString(key, "3"); // thread counts 4
String value = mSharedPreferences.getString(key, "3"); // actual thread counts is 3 + 1 = 4
try {
return Integer.valueOf(value).intValue();
return Integer.valueOf(value).intValue() + 1;
} catch (NumberFormatException e) {
CDELog.j(TAG, "exception occurred");
return 3;
return 4;
}
}

Expand Down
5 changes: 5 additions & 0 deletions external/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@ gstreamer/
ncnn/
CLBlast/
llamacpp/
ff-deps/
ffdeps/
ffmepg-deps/


ffmpeg-6.1

*.a
*.so
File renamed without changes.
34 changes: 32 additions & 2 deletions external/whispercpp/jni/whispercpp-jni-impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ typedef struct {
char sz_model_path[MAX_PATH_LEN];
size_t n_threads;

//03-20-2024,referenced by:https://github.com/futo-org/whisper-acft
size_t n_decoding_mode; // 0:WHISPER_SAMPLING_GREEDY 1:WHISPER_SAMPLING_BEAM_SEARCH

size_t n_asr_mode; // 0: normal transcription 1: asr pressure test 2:benchmark 3: transcription + audio record
size_t n_benchmark_type; // what to benchmark: 0: asr, 1: memcpy 2: mulmat 3: whisper_encode/whisper full benchmark
bool b_use_gpu;
Expand Down Expand Up @@ -847,7 +850,9 @@ class whisper_asr {
n_end_time = ggml_time_us();
n_durtion = (n_end_time - n_begin_time) / 1000;

if (n_durtion > 1000) { // 1 seconds, very good on Xiaomi 14, about 500-700 ms with GGML model ggml-tiny.en-q8_0.bin
// 1 second, very good on Xiaomi 14, about 500-700 ms with GGML model ggml-tiny.en-q8_0.bin
// 0.8 second with new method(adjust audio_context dynamically) would cause app crash suddenly or produce sketchy/incorrect/repeat tokens
if (n_durtion > 900) {
LOGGD("duration of audio data gathering is: %d milliseconds\n", n_durtion);
LOGGD("size of gathered audio data: %d\n", _n_whisper_in_size);
LOGGD("total audio sample counts %d\n", _n_total_sample_counts);
Expand Down Expand Up @@ -1186,6 +1191,21 @@ static const char * whisper_asr_audio_to_text(const float * pf32_audio_buffer, i

begin_time = ggml_time_ms();
whisper_reset_timings(p_asr_ctx->p_context);

//03-20-2024,referenced by:https://github.com/futo-org/whisper-acft
p_asr_ctx->p_params->max_tokens = 256;
p_asr_ctx->p_params->temperature_inc = 0.0f;
p_asr_ctx->p_params->audio_ctx = std::min(1500, (int)ceil((double)num_samples / (double)(320.0)) + 16);
if (WHISPER_SAMPLING_GREEDY == p_asr_ctx->n_decoding_mode) {
p_asr_ctx->p_params->strategy = WHISPER_SAMPLING_GREEDY;
p_asr_ctx->p_params->greedy.best_of = 1;//https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
} else {
p_asr_ctx->p_params->strategy = WHISPER_SAMPLING_BEAM_SEARCH;
p_asr_ctx->p_params->beam_search.beam_size = 5;//https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
p_asr_ctx->p_params->greedy.best_of = 5;
}
//LOGGD("decoding_mode=%d, audio_ctx=%d\n", p_asr_ctx->n_decoding_mode, p_asr_ctx->p_params->audio_ctx);

result = whisper_full(p_asr_ctx->p_context, *p_asr_ctx->p_params, pf32_audio_buffer, num_samples);
if (0 != result) {
LOGW("whisper inference failure, pls check why?\n");
Expand Down Expand Up @@ -1350,9 +1370,19 @@ int whisper_asr_init(const char * sz_model_path, int n_threads, int n_asrmode) {
params.speed_up = false;
params.debug_mode = false;

params.audio_ctx = 0;

params.suppress_blank = false;
//params.suppress_non_speech_tokens = true;
//params.language = "en";

//03-20-2024,referenced by:https://github.com/futo-org/whisper-acft
p_asr_ctx->n_decoding_mode = WHISPER_SAMPLING_GREEDY;


//params.tdrz_enable = false;//whisper complain failed to compute log mel spectrogram when this flag was enabled
//params.suppress_blank = true;
//params.suppress_non_speech_tokens = true;
params.suppress_non_speech_tokens = true;

memcpy(p_asr_ctx->p_params, &params, sizeof(struct whisper_full_params));

Expand Down

2 comments on commit 4cd35dd

@zhouwg
Copy link
Owner Author

@zhouwg zhouwg commented on 4cd35dd Mar 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This new fine-tuning method was introduced in ggerganov/whisper.cpp#1951,

and the meaning of each parameter can be found at:

https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h#L489

The performance of real-time transcription on the Xiaomi 14 was improved very significantly by this new fine-tuning method.

before fine-tune:

Screenshot from 2024-03-16 21-18-24

after fine-tune:

Screenshot from 2024-03-20 16-40-19

However, this fine-tuning also brings an unexpected side effect: whispercpp may produce incorrect or repeated tokens, or the app may crash repeatedly. The inference performance of the previous commit is NOT better than that of this method, but the code in the previous commit is very robust, with almost NO crashes.

@zhouwg
Copy link
Owner Author

@zhouwg zhouwg commented on 4cd35dd Mar 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, frankly speaking, I do not really understand the key code in this commit, because I know very little about real, hardcore AI technology. I am not a parameter-tuning engineer, let alone someone like the great Georgi Gerganov, who is a real AI expert and a real master of modern C++.

I will revert this commit in the master branch accordingly.

Please sign in to comment.