whisper : add integer quantization support (ggerganov#540)

* whisper : add integer quantization support
* examples : add common-ggml + prepare to add "quantize" tool
* whisper : quantization tool ready
* whisper : fix F32 support
* whisper : try to fix shared lib linkage
* wasm : update quantized models to Q5
* bench.wasm : remove "medium" button
* bench.wasm : fix custom model button
* ggml : add Q5_0 and Q5_1 WASM SIMD
* wasm : add quantized models to all WASM examples
* wasm : bump DB version number to 2
* talk-llama : update example to latest llama.cpp
* node : increase test timeout to 10s
* readme : add information for model quantization
* wasm : add links to other examples
landtanin · Apr 30, 2023 · commit 7a0d614 · 1 parent 8d56873
Showing 41 changed files with 3,180 additions and 1,007 deletions.
diff --git a/.gitignore b/.gitignore
@@ -23,6 +23,7 @@ build-sanitize-thread/
 /talk
 /talk-llama
 /bench
+/quantize
 
 arm_neon.h
 sync.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -303,6 +303,12 @@ if (BUILD_SHARED_LIBS)
 
     target_compile_definitions(${TARGET} PUBLIC
         WHISPER_SHARED
+        GGML_SHARED
+        )
+
+    target_compile_definitions(${TARGET} PRIVATE
+        WHISPER_BUILD
+        GGML_BUILD
         )
 endif()
 

diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-default: main bench
+default: main bench quantize
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -243,15 +243,15 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
 
 clean:
-	rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
 
 #
 # Examples
 #
 
 CC_SDL=`sdl2-config --cflags --libs`
 
-SRC_COMMON = examples/common.cpp
+SRC_COMMON     = examples/common.cpp examples/common-ggml.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp
 
 main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
@@ -261,6 +261,9 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
 bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
 
+quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
+	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o quantize $(LDFLAGS)
+
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
 

diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - AVX intrinsics support for x86 architectures
 - VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
+- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
 - Low memory usage (Flash Attention)
 - Zero memory allocations at runtime
 - Runs on the CPU
@@ -228,6 +229,22 @@ make large
 | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
 | large  | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
 
+## Quantization
+
+`whisper.cpp` supports integer quantization of the Whisper `ggml` models.
+Quantized models require less memory and disk space and, depending on the hardware, can be processed more efficiently.
+
+Here are the steps for creating and using a quantized model:
+
+```bash
+# quantize a model with Q5_0 method
+make quantize
+./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
+
+# run the examples as usual, specifying the quantized model file
+./main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
+```
+
 ## Core ML support
 
 On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant

diff --git a/bindings/javascript/whisper.js b/bindings/javascript/whisper.js
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -21,10 +21,14 @@ set(TARGET common)
 add_library(${TARGET} STATIC
     common.h
     common.cpp
+    common-ggml.h
+    common-ggml.cpp
     )
 
 include(DefaultTargetOptions)
 
+target_link_libraries(${TARGET} PRIVATE whisper)
+
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 if (WHISPER_SDL2)
@@ -62,6 +66,7 @@ else()
     add_subdirectory(stream)
     add_subdirectory(command)
     add_subdirectory(bench)
+    add_subdirectory(quantize)
     add_subdirectory(talk)
     add_subdirectory(talk-llama)
 endif()
diff --git a/examples/addon.node/__test__/whisper.spec.js b/examples/addon.node/__test__/whisper.spec.js
@@ -14,9 +14,10 @@ const whisperParamsMock = {
 };
 
 describe("Run whisper.node", () => {
-  test("it should receive a non-empty value", async () => {
-    let result = await whisperAsync(whisperParamsMock);
+    test("it should receive a non-empty value", async () => {
+        let result = await whisperAsync(whisperParamsMock);
 
-    expect(result.length).toBeGreaterThan(0);
-  });
+        expect(result.length).toBeGreaterThan(0);
+    }, 10000);
 });
+
diff --git a/examples/bench.wasm/CMakeLists.txt b/examples/bench.wasm/CMakeLists.txt
@@ -31,9 +31,9 @@ endif()
 set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
     --bind \
     -s USE_PTHREADS=1 \
-    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1024MB \
-    -s TOTAL_MEMORY=1024MB \
+    -s PTHREAD_POOL_SIZE_STRICT=0 \
+    -s INITIAL_MEMORY=2000MB \
+    -s TOTAL_MEMORY=2000MB \
     -s FORCE_FILESYSTEM=1 \
     -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
     ${EXTRA_FLAGS} \

diff --git a/examples/bench.wasm/index-tmpl.html b/examples/bench.wasm/index-tmpl.html
@@ -35,6 +35,15 @@
 
             <br><br>
 
+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+            <br><br>
+
             <hr>
 
             Select the model you would like to use and click the "Bench" button.<br>
@@ -44,11 +53,18 @@
 
             <div id="model-whisper">
                 Whisper model: <span id="model-whisper-status"></span>
-                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
-                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
-                <span id="fetch-whisper-progress"></span>
-
+                <button id="fetch-whisper-tiny-en"  onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
+                <button id="fetch-whisper-base-en"  onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
                 <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
+                <button id="fetch-whisper-small-en-q5_1"  onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
+                <button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
+                <button id="fetch-whisper-large-q5_0"     onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
+                <span id="fetch-whisper-progress"></span>
             </div>
 
             <br>
@@ -160,6 +176,14 @@
 
                 document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                 document.getElementById('fetch-whisper-base-en').style.display = 'none';
+                document.getElementById('fetch-whisper-small-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+                document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+                document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
                 document.getElementById('whisper-file'         ).style.display = 'none';
                 document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
             }
@@ -168,19 +192,42 @@
                 let urls = {
                     'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                     'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+                    'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
+                    'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
+                    'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
+                    'large-q5_0':    'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
                 };
 
                 let sizes = {
                     'tiny.en': 75,
                     'base.en': 142,
+                    'small.en': 466,
+
+                    'tiny-en-q5_1':   31,
+                    'base-en-q5_1':   57,
+                    'small-en-q5_1':  182,
+                    'medium-en-q5_0': 515,
+                    'large-q5_0':     1030,
                 };
 
                 let url     = urls[model];
                 let dst     = 'whisper.bin';
                 let size_mb = sizes[model];
 
-                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
-                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+                document.getElementById('fetch-whisper-tiny-en').style.display  = 'none';
+                document.getElementById('fetch-whisper-base-en').style.display  = 'none';
+                document.getElementById('fetch-whisper-small-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+                document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+                document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
+                document.getElementById('whisper-file'        ).style.display = 'none';
                 document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
 
                 cbProgress = function(p) {
@@ -190,9 +237,18 @@
 
                 cbCancel = function() {
                     var el;
-                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
+                    el = document.getElementById('fetch-whisper-tiny-en');  if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en');  if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'  ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'  ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-large-q5_0'    ); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('whisper-file'        ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
                 };
 
                 loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);

diff --git a/examples/command.wasm/index-tmpl.html b/examples/command.wasm/index-tmpl.html
@@ -35,6 +35,15 @@
 
             <br><br>
 
+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+            <br><br>
+
             <hr>
 
             Select the model you would like to use, click the "Start" button and follow the instructions.
@@ -45,6 +54,10 @@
                 Whisper model: <span id="model-whisper-status"></span>
                 <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                 <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
                 <span id="fetch-whisper-progress"></span>
 
                 <!--
@@ -162,11 +175,17 @@
                 let urls = {
                     'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                     'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
                 };
 
                 let sizes = {
                     'tiny.en': 75,
                     'base.en': 142,
+
+                    'tiny-en-q5_1':   31,
+                    'base-en-q5_1':   57,
                 };
 
                 let url     = urls[model];
@@ -177,6 +196,10 @@
 
                 document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                 document.getElementById('fetch-whisper-base-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
+
                 document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
 
                 cbProgress = function(p) {
@@ -188,6 +211,10 @@
                     var el;
                     el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                     el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
+
                     el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
                 };
diff --git a/.gitignore b/.gitignore
@@ -23,6 +23,7 @@ build-sanitize-thread/
 /talk
 /talk-llama
 /bench
+/quantize
 
 arm_neon.h
 sync.sh
@@ Expand Down @@