From ed667e95816a74126252cca9a7f6390a3cf3ace0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Sun, 2 Apr 2023 15:59:14 +0200
Subject: [PATCH 1/6] quantize-stats command

Add a command that calculates some statistics over the errors
introduced by quantization, at the moment the mean squared error and
the max error of the layer weights. It should be useful for testing
quantization improvements.

It needs some internal state from ggml and llama that should not be
part of the public API.
---
 .gitignore                                 |   1 +
 Makefile                                   |   5 +-
 examples/CMakeLists.txt                    |   1 +
 examples/quantize-stats/CMakeLists.txt     |   4 +
 examples/quantize-stats/quantize-stats.cpp | 321 +++++++++++++++++++++
 ggml.c                                     |  17 +-
 ggml_internal.h                            |  25 ++
 llama.cpp                                  |   6 +
 llama_internal.h                           |  13 +
 9 files changed, 382 insertions(+), 11 deletions(-)
 create mode 100644 examples/quantize-stats/CMakeLists.txt
 create mode 100644 examples/quantize-stats/quantize-stats.cpp
 create mode 100644 ggml_internal.h
 create mode 100644 llama_internal.h

diff --git a/.gitignore b/.gitignore
index 1c75d38d11d1a..3b8d02c30ff60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ models/*
 
 /main
 /quantize
+/quantize-stats
 /result
 /perplexity
 /embedding
diff --git a/Makefile b/Makefile
index 2f828bf10d747..6745cdeefff74 100644
--- a/Makefile
+++ b/Makefile
@@ -148,7 +148,7 @@ common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 
 clean:
-	rm -vf *.o main quantize perplexity embedding
+	rm -vf *.o main quantize quantize-stats perplexity embedding
 
 main: examples/main/main.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@@ -159,6 +159,9 @@ main: examples/main/main.cpp ggml.o llama.o common.o
 quantize: examples/quantize/quantize.cpp ggml.o llama.o
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
 
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index ce3a34710a5fe..67a7cea543a40 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -31,6 +31,7 @@ if (EMSCRIPTEN)
 else()
     add_subdirectory(main)
     add_subdirectory(quantize)
+    add_subdirectory(quantize-stats)
    add_subdirectory(perplexity)
     add_subdirectory(embedding)
 endif()
diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt
new file mode 100644
index 0000000000000..7bebc11a124b0
--- /dev/null
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET quantize-stats)
+add_executable(${TARGET} quantize-stats.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
new file mode 100644
index 0000000000000..e922c3a7a7434
--- /dev/null
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -0,0 +1,321 @@
+#include "ggml.h"
+#include "ggml_internal.h"
+#include "llama.h"
+#include "llama_internal.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
"f16", "f32" }; +static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list"); + +struct quantize_stats_params { + std::string model = "models/7B/ggml-model-f16.bin"; + bool verbose = false; + bool per_layer_stats = false; + bool print_histogram = false; + std::vector include_layers; + std::vector exclude_layers; + std::vector include_types; +}; + +const size_t HISTOGRAM_BUCKETS = 30; +const double HISTOGRAM_RANGE = 0.03; + +struct error_stats { + size_t num_samples; + double total_error; + double max_error; + uint64_t error_histogram[HISTOGRAM_BUCKETS]; +}; + + +void quantize_stats_print_usage(int /*argc*/, char ** argv) { + quantize_stats_params params; + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " -v, --verbose\n"); + fprintf(stderr, " verbose output (default: false)\n"); + fprintf(stderr, " -p, --per-layer-stats\n"); + fprintf(stderr, " print stats per layer (default: false)\n"); + fprintf(stderr, " --histogram\n"); + fprintf(stderr, " print error histogram (default: false)\n"); + fprintf(stderr, " -l LAYER, --include-layer LAYER\n"); + fprintf(stderr, " only test layers containing substring\n"); + fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n"); + fprintf(stderr, " exclude layers containing substring\n"); + fprintf(stderr, " -t TYPE, --type TYPE\n"); + fprintf(stderr, " only test given type (q4_0, q4_1)\n"); + fprintf(stderr, "\n"); +} + +// Check if a layer is included/excluded by command line +bool layer_included(const quantize_stats_params params, const std::string & layer) { + for (const auto& excluded : params.exclude_layers) { + if (layer.find(excluded) != std::string::npos) { + return false; + } + } + for (const auto& included : params.include_layers) { + if (layer.find(included) != std::string::npos) { + return true; + } + } + return params.include_layers.empty(); +} + +// Update error statistics given vectors with the before/after result of quantization +void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { + for (int64_t i = 0; i < nelements; i++) { + double diff = input[i] - output[i]; + stats.total_error += diff * diff; + stats.max_error = fmax(fabs(diff), stats.max_error); + stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++; + } + stats.num_samples += nelements; +} + +void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) { + printf("%-50s: mse %.8f, maxerr %.8f\n", name.c_str(), stats.total_error / (double) stats.num_samples, stats.max_error); + if (print_histogram) { + printf("Error distribution:\n"); + for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) { + double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; + double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; + if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY; + printf("[%3.3f, %3.3f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]); + } + } +} + +// copied from ggml.h - verify that we can access this as a flat array +static bool tensor_is_contiguous(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == 
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
+        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
+// Run quantization function for a single layer and update error stats
+void test_roundtrip_on_layer(
+        std::string & name,
+        bool print_layer_stats,
+        const quantize_fns_t & qfns,
+        const ggml_tensor * layer,
+        float * input_scratch,
+        char * quantized_scratch,
+        float * output_scratch,
+        error_stats & total_error) {
+
+    assert(tensor_is_contiguous(layer));
+    int64_t nelements = ggml_nelements(layer);
+
+    if (layer->type == GGML_TYPE_F16) {
+        for (int i = 0; i < nelements; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i);
+        }
+    } else {
+        input_scratch = ggml_get_data_f32(layer);
+    }
+
+    qfns.quantize_row_q(input_scratch, quantized_scratch, nelements);
+    qfns.dequantize_row_q(quantized_scratch, output_scratch, nelements);
+
+    update_error_stats(nelements, input_scratch, output_scratch, total_error);
+    if (print_layer_stats) {
+        error_stats layer_error {};
+        update_error_stats(nelements, input_scratch, output_scratch, layer_error);
+        print_error_stats(name, layer_error, false);
+    }
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    quantize_stats_params params;
+
+    // read command line
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            quantize_stats_print_usage(argc, argv);
+            exit(0);
+        } else if (arg == "-v" || arg == "--verbose") {
+            params.verbose = true;
+        } else if (arg == "-p" || arg == "--per-layer-stats") {
+            params.per_layer_stats = true;
+        } else if (arg == "--histogram") {
+            params.print_histogram = true;
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-l" || arg == "--include-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_layers.push_back(argv[i]);
+        } else if (arg == "-L" || arg == "--exclude-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.exclude_layers.push_back(argv[i]);
+        } else if (arg == "-t" || arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int j;
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
+                // find match
+            }
+            if (j < GGML_TYPE_COUNT) {
+                params.include_types.push_back((ggml_type) j);
+            } else {
+                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
+                invalid_param = true;
+            }
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            quantize_stats_print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        quantize_stats_print_usage(argc, argv);
+        return 1;
+    }
+
+    // load the model
+    fprintf(stderr, "Loading model\n");
+
+    const int64_t t_main_start_us = ggml_time_us();
+    llama_context * ctx;
+
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx     = 256;
+        lparams.n_parts   = 1;
+        lparams.seed      = 1;
+        lparams.f16_kv    = false;
+        lparams.use_mlock = false;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    // Sort tensors for consistent output
+    const auto tensors = llama_internal_get_tensor_map(ctx);
+    std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };
+
+    // check layer tensors
+    int included_layers = 0;
+    int64_t max_nelements = 0;
+    bool is_f16 = false;
+    for (const auto& kv_tensor : tensors_sorted) {
+        if (!layer_included(params, kv_tensor.first)) {
+            continue;
+        }
+        if (params.verbose) {
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
+        }
+        if (kv_tensor.second->type == GGML_TYPE_F16) {
+            is_f16 = true;
+        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
+                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
+            llama_free(ctx);
+            return 1;
+        }
+        included_layers++;
+        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
+    }
+
+    if (is_f16) {
+        printf("note: source model is f16\n");
+    }
+    printf("testing %d layers with max size %" PRId64 ", allocating %" PRId64 " bytes\n", included_layers, max_nelements, 3*4*max_nelements);
+    // allocate scratch space
+    std::vector<float> input_scratch(max_nelements);
+    std::vector<char> quantized_scratch(max_nelements*4);
+    std::vector<float> output_scratch(max_nelements);
+
+    // loop through quantization types
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+            continue;
+        }
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+            if (params.verbose) {
+                printf("testing %s ...\n", type_strs[i]);
+            }
+
+            error_stats global_stats {};
+
+            for (const auto& kv_tensor : tensors_sorted) {
+                if (!layer_included(params, kv_tensor.first)) {
+                    continue;
+                }
+                if (params.verbose) {
+                    printf("  %s ...\n", kv_tensor.first.c_str());
+                }
+                std::string layer_name { type_strs[i] };
+                layer_name += "::" + kv_tensor.first;
+                test_roundtrip_on_layer(
+                        layer_name,
+                        params.per_layer_stats,
+                        qfns,
+                        kv_tensor.second,
+                        input_scratch.data(),
+                        quantized_scratch.data(),
+                        output_scratch.data(),
+                        global_stats
+                );
+            }
+
+            print_error_stats(type_strs[i], global_stats, params.print_histogram);
+        }
+    }
+
+
+    llama_free(ctx);
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+
+    return 0;
+}
diff --git a/ggml.c b/ggml.c
index 63aa5eb6eb0f8..3a28616ffa30b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2,6 +2,7 @@
 #define _GNU_SOURCE
 
 #include "ggml.h"
+#include "ggml_internal.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -6496,16 +6497,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     //}
 }
 
-typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
-typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
-typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
-
-typedef struct {
-    dequantize_row_q_t dequantize_row_q;
-    quantize_row_q_t   quantize_row_q;
-    vec_dot_q_t        vec_dot_q;
-} quantize_fns_t;
-
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = {
         .dequantize_row_q = dequantize_row_q4_0,
@@ -6519,6 +6510,12 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     },
 };
 
+// For internal test use
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
+    GGML_ASSERT(i < GGML_TYPE_COUNT);
+    return quantize_fns[i];
+}
+
 static void ggml_compute_forward_mul_mat_q_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
diff --git a/ggml_internal.h b/ggml_internal.h
new file mode 100644
index 0000000000000..0761bad3e01f2
--- /dev/null
+++ b/ggml_internal.h
@@ -0,0 +1,25 @@
+#pragma once
+
+// Internal functions exposed for tests and benchmarks
+
+#ifdef __cplusplus
+// restrict not standard in C++
+#define restrict
+extern "C" {
+#endif
+
+typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
+typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/llama.cpp b/llama.cpp
index 854bb8993fbc5..b736dd88a26ff 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1,4 +1,5 @@
 #include "llama.h"
+#include "llama_internal.h"
 
 #include "ggml.h"
 
@@ -1854,3 +1855,8 @@ const char * llama_print_system_info(void) {
 
     return s.c_str();
 }
+
+// For internal test use
+std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+    return ctx->model.tensors;
+}
diff --git a/llama_internal.h b/llama_internal.h
new file mode 100644
index 0000000000000..25c8c2c8746ab
--- /dev/null
+++ b/llama_internal.h
@@ -0,0 +1,13 @@
+#ifndef LLAMA_INTERNAL_H
+#define LLAMA_INTERNAL_H
+
+// Internal functions exposed for tests and benchmarks
+
+#include "ggml.h"
+
+#include <string>
+#include <unordered_map>
+
+std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif

From 5b1143ed932f1b9f8e77256df075e25ca72321f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Mon, 3 Apr 2023 17:32:03 +0200
Subject: [PATCH 2/6] quantize-stats: show percentiles

Show some error percentiles; these should be less noisy than the max
error alone.
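
The quantile is estimated from the bucketed error histogram rather
than from stored samples, so it is only resolved to bucket
granularity. A minimal sketch of the idea (this mirrors the
find_quantile() added below; quantile_upper_bound is an illustrative
name, not part of the patch):

    // walk the cumulative histogram counts until the requested fraction
    // of all samples is covered, then report that bucket's upper edge
    static double quantile_upper_bound(const uint64_t * h, double quantile) {
        double total = std::accumulate(h, h + HISTOGRAM_BUCKETS, 0.0);
        double accum = 0.0;
        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
            accum += h[i];
            if (accum >= total * quantile) {
                return (i + 1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
            }
        }
        return INFINITY; // quantile lies beyond the histogram range
    }

With 150 buckets over [0, 0.03) each bucket is 0.0002 wide, which is
why the new columns print "95pct<" and "median<": the true quantile
lies somewhere below the reported bucket edge.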
---
 examples/quantize-stats/quantize-stats.cpp | 23 +++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index e922c3a7a7434..22860441d5f56 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -10,6 +10,7 @@
 #include <cstdio>
 #include <cstring>
 #include <map>
+#include <numeric>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -27,7 +28,7 @@ struct quantize_stats_params {
     std::vector<enum ggml_type> include_types;
 };
 
-const size_t HISTOGRAM_BUCKETS = 30;
+const size_t HISTOGRAM_BUCKETS = 150;
 const double HISTOGRAM_RANGE = 0.03;
 
 struct error_stats {
@@ -87,15 +88,31 @@ void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
     stats.num_samples += nelements;
 }
 
+double find_quantile(const error_stats & stats, double quantile) {
+    double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
+
+    double accum = 0;
+    for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+        accum += stats.error_histogram[i];
+        if (accum >= sum*quantile) {
+            return (i + 1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+        }
+    }
+    return INFINITY;
+}
+
 void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
-    printf("%-50s: mse %.8f, maxerr %.8f\n", name.c_str(), stats.total_error / (double) stats.num_samples, stats.max_error);
+    double rms = stats.total_error / (double) stats.num_samples;
+    double median = find_quantile(stats, .5);
+    double pct95 = find_quantile(stats, .95);
+    printf("%-50s: mse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rms, stats.max_error, pct95, median);
     if (print_histogram) {
         printf("Error distribution:\n");
         for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
             double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
             double upper = (i + 1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
             if (i == HISTOGRAM_BUCKETS - 1) upper = INFINITY;
-            printf("[%3.3f, %3.3f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
+            printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
         }
     }
 }

From a7d3c3f304064b2f779733bfd43d1f3de2933046 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Tue, 4 Apr 2023 00:21:01 +0200
Subject: [PATCH 3/6] quantize-stats: use less scratch memory

Test quantization in smaller chunks instead of a layer at a time.
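
The scratch buffers previously had to hold the largest layer: about
12 bytes per element (two f32 buffers plus a generously sized 4
bytes/element quantized buffer), roughly 200 MB for a 4096 x 4096 f32
tensor. With SCRATCH_ELEMENTS = 32*32 they shrink to about 12 KB in
total. A minimal sketch of the chunked round-trip (names as in the
diff below, with `input` standing for the layer's f32 data; the
f16-to-f32 copy path is omitted):

    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
        // quantize + dequantize one chunk and accumulate its error
        qfns.quantize_row_q(input + offset, quantized_scratch, chunk_size);
        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
        update_error_stats(chunk_size, input + offset, output_scratch, total_error);
    }

32*32 = 1024 also keeps each chunk a multiple of the 32-element
quantization block that the row quantizers require; the LLaMA weight
shapes should all be multiples of that.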
---
 examples/quantize-stats/quantize-stats.cpp | 37 +++++++++++++---------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 22860441d5f56..6d8cf95a6890e 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -28,6 +28,7 @@ struct quantize_stats_params {
     std::vector<enum ggml_type> include_types;
 };
 
+const int64_t SCRATCH_ELEMENTS = 32*32;
 const size_t HISTOGRAM_BUCKETS = 150;
 const double HISTOGRAM_RANGE = 0.03;
 
 struct error_stats {
@@ -140,23 +141,29 @@ void test_roundtrip_on_layer(
         error_stats & total_error) {
 
     assert(tensor_is_contiguous(layer));
+    error_stats layer_error {};
     int64_t nelements = ggml_nelements(layer);
 
-    if (layer->type == GGML_TYPE_F16) {
-        for (int i = 0; i < nelements; i++) {
-            input_scratch[i] = ggml_get_f32_1d(layer, i);
+    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
+        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
+
+        if (layer->type == GGML_TYPE_F16) {
+            for (int i = 0; i < chunk_size; i++) {
+                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+            }
+        } else {
+            input_scratch = ggml_get_data_f32(layer) + offset;
         }
-    } else {
-        input_scratch = ggml_get_data_f32(layer);
-    }
 
-    qfns.quantize_row_q(input_scratch, quantized_scratch, nelements);
-    qfns.dequantize_row_q(quantized_scratch, output_scratch, nelements);
+        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
 
-    update_error_stats(nelements, input_scratch, output_scratch, total_error);
+        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
+        if (print_layer_stats) {
+            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
+        }
+    }
     if (print_layer_stats) {
-        error_stats layer_error {};
-        update_error_stats(nelements, input_scratch, output_scratch, layer_error);
         print_error_stats(name, layer_error, false);
     }
 }
@@ -280,11 +287,11 @@ int main(int argc, char ** argv) {
     if (is_f16) {
         printf("note: source model is f16\n");
     }
-    printf("testing %d layers with max size %" PRId64 ", allocating %" PRId64 " bytes\n", included_layers, max_nelements, 3*4*max_nelements);
+    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
     // allocate scratch space
-    std::vector<float> input_scratch(max_nelements);
-    std::vector<char> quantized_scratch(max_nelements*4);
-    std::vector<float> output_scratch(max_nelements);
+    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
+    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
+    std::vector<float> output_scratch(SCRATCH_ELEMENTS);
 
     // loop through quantization types
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {

From d4915074c473c5c4fa9c538292bc1e569d2fd222 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Tue, 4 Apr 2023 00:33:09 +0200
Subject: [PATCH 4/6] quantize-stats: misc improvements

Show RMSE instead of MSE - it keeps a similar range to the other
metrics.

Match the layer patterns as regexes instead of substrings.
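
RMSE is just the square root of the MSE, so it is in the same units
as the error values themselves; e.g. a hypothetical MSE of 0.0000095
prints as an RMSE of about 0.0031, directly comparable to the maxerr
and percentile columns. An illustrative invocation of the regex
matching (the layer names are examples):

    ./quantize-stats -t q4_0 -l 'layers\.[0-9]+\.attention' -L norm

This tests only the attention tensors of the numbered layers and
skips anything matching "norm". std::regex_search() matches anywhere
in the tensor name, so the old plain-substring arguments keep
working.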
---
 examples/quantize-stats/quantize-stats.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6d8cf95a6890e..6ec7ce67de8b9 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -11,6 +11,7 @@
 #include <cstring>
 #include <map>
 #include <numeric>
+#include <regex>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -55,9 +56,9 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
     fprintf(stderr, "  --histogram\n");
     fprintf(stderr, "                        print error histogram (default: false)\n");
     fprintf(stderr, "  -l LAYER, --include-layer LAYER\n");
-    fprintf(stderr, "                        only test layers containing substring\n");
+    fprintf(stderr, "                        only test layers matching pattern\n");
     fprintf(stderr, "  -L LAYER, --exclude-layer LAYER\n");
-    fprintf(stderr, "                        exclude layers containing substring\n");
+    fprintf(stderr, "                        exclude layers matching pattern\n");
     fprintf(stderr, "  -t TYPE, --type TYPE\n");
     fprintf(stderr, "                        only test given type (q4_0, q4_1)\n");
     fprintf(stderr, "\n");
@@ -66,12 +67,12 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
 // Check if a layer is included/excluded by command line
 bool layer_included(const quantize_stats_params params, const std::string & layer) {
     for (const auto& excluded : params.exclude_layers) {
-        if (layer.find(excluded) != std::string::npos) {
+        if (std::regex_search(layer, std::regex(excluded))) {
             return false;
         }
     }
     for (const auto& included : params.include_layers) {
-        if (layer.find(included) != std::string::npos) {
+        if (std::regex_search(layer, std::regex(included))) {
             return true;
         }
     }
     return params.include_layers.empty();
@@ -103,10 +104,10 @@ double find_quantile(const error_stats & stats, double quantile) {
 }
 
 void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
-    double rms = stats.total_error / (double) stats.num_samples;
+    double rmse = sqrt(stats.total_error / (double) stats.num_samples);
     double median = find_quantile(stats, .5);
     double pct95 = find_quantile(stats, .95);
-    printf("%-50s: mse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rms, stats.max_error, pct95, median);
+    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
     if (print_histogram) {
         printf("Error distribution:\n");
         for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {

From 63cfa43200860fa81a90439d172173f93f6fc994 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Wed, 5 Apr 2023 03:30:23 +0200
Subject: [PATCH 5/6] quantize-stats: add option to test against reference
 quantization

Expose the reference quantization implementation and add an option to
use it for the tests.
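
The optimized quantize_row_q implementations may use SIMD code paths
that do not round exactly like the scalar *_reference versions, so
the two can produce slightly different error statistics. The
selection amounts to (a sketch; both function pointers share the
quantize_row_q_t signature added to quantize_fns_t below):

    // pick the row quantizer under test, then round-trip as before
    quantize_row_q_t quantize = use_reference ? qfns.quantize_row_q_reference
                                              : qfns.quantize_row_q;
    quantize(input_scratch, quantized_scratch, chunk_size);
    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);

Running the tool twice, with and without -r, then shows whether an
optimized path diverges from the reference:

    ./quantize-stats -t q4_0
    ./quantize-stats -t q4_0 -r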
---
 examples/quantize-stats/quantize-stats.cpp | 13 ++++++++++++-
 ggml.c                                     | 14 ++++++++------
 ggml_internal.h                            |  1 +
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6ec7ce67de8b9..924058fd8c605 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -24,6 +24,7 @@ struct quantize_stats_params {
     bool verbose = false;
     bool per_layer_stats = false;
     bool print_histogram = false;
+    bool reference = false;
     std::vector<std::string> include_layers;
     std::vector<std::string> exclude_layers;
     std::vector<enum ggml_type> include_types;
@@ -49,6 +50,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -r, --reference\n");
+    fprintf(stderr, "                        use reference implementation (default: false)\n");
     fprintf(stderr, "  -v, --verbose\n");
     fprintf(stderr, "                        verbose output (default: false)\n");
     fprintf(stderr, "  -p, --per-layer-stats\n");
@@ -135,6 +138,7 @@ void test_roundtrip_on_layer(
         std::string & name,
         bool print_layer_stats,
         const quantize_fns_t & qfns,
+        bool use_reference,
         const ggml_tensor * layer,
         float * input_scratch,
         char * quantized_scratch,
@@ -156,7 +160,11 @@ void test_roundtrip_on_layer(
             input_scratch = ggml_get_data_f32(layer) + offset;
         }
 
-        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+        if (use_reference) {
+            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+        } else {
+            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+        }
         qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
 
         update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
@@ -184,6 +192,8 @@ int main(int argc, char ** argv) {
         if (arg == "-h" || arg == "--help") {
             quantize_stats_print_usage(argc, argv);
             exit(0);
+        } else if (arg == "-r" || arg == "--reference") {
+            params.reference = true;
         } else if (arg == "-v" || arg == "--verbose") {
             params.verbose = true;
         } else if (arg == "-p" || arg == "--per-layer-stats") {
@@ -320,6 +330,7 @@ int main(int argc, char ** argv) {
                         layer_name,
                         params.per_layer_stats,
                         qfns,
+                        params.reference,
                         kv_tensor.second,
                         input_scratch.data(),
                         quantized_scratch.data(),
diff --git a/ggml.c b/ggml.c
index 3a28616ffa30b..de986e591fd68 100644
--- a/ggml.c
+++ b/ggml.c
@@ -6499,14 +6499,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = {
-        .dequantize_row_q = dequantize_row_q4_0,
-        .quantize_row_q   = quantize_row_q4_0,
-        .vec_dot_q        = ggml_vec_dot_q4_0,
+        .dequantize_row_q         = dequantize_row_q4_0,
+        .quantize_row_q           = quantize_row_q4_0,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
+        .vec_dot_q                = ggml_vec_dot_q4_0,
     },
     [GGML_TYPE_Q4_1] = {
-        .dequantize_row_q = dequantize_row_q4_1,
-        .quantize_row_q   = quantize_row_q4_1,
-        .vec_dot_q        = ggml_vec_dot_q4_1,
+        .dequantize_row_q         = dequantize_row_q4_1,
+        .quantize_row_q           = quantize_row_q4_1,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
+        .vec_dot_q                = ggml_vec_dot_q4_1,
     },
 };
 
diff --git a/ggml_internal.h b/ggml_internal.h
index 0761bad3e01f2..6bfa441d5f6a9 100644
--- a/ggml_internal.h
+++ b/ggml_internal.h
@@ -15,6 +15,7 @@ typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
 
 typedef struct {
     dequantize_row_q_t dequantize_row_q;
     quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
     vec_dot_q_t        vec_dot_q;
 } quantize_fns_t;
 

From 41d4a863c901f69045d260991a08a0f202581edc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?=
Date: Wed, 5 Apr 2023 22:18:58 +0200
Subject: [PATCH 6/6] Remove "internal" header files

Move the declarations into the main headers, with a comment saying
not to use them, per PR feedback.
---
 examples/quantize-stats/quantize-stats.cpp |  2 --
 ggml.c                                     |  1 -
 ggml.h                                     | 24 ++++++++++++++++++++
 ggml_internal.h                            | 26 ----------------------
 llama.cpp                                  |  1 -
 llama.h                                    |  7 ++++++
 llama_internal.h                           | 13 -------------
 7 files changed, 31 insertions(+), 43 deletions(-)
 delete mode 100644 ggml_internal.h
 delete mode 100644 llama_internal.h

diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 924058fd8c605..af1e6272e80b2 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,7 +1,5 @@
 #include "ggml.h"
-#include "ggml_internal.h"
 #include "llama.h"
-#include "llama_internal.h"
 
 #include <algorithm>
 #include <cassert>
diff --git a/ggml.c b/ggml.c
index de986e591fd68..7b017f8e75bfd 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2,7 +2,6 @@
 #define _GNU_SOURCE
 
 #include "ggml.h"
-#include "ggml_internal.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
diff --git a/ggml.h b/ggml.h
index ad962b109ea89..f2567d7dabe6e 100644
--- a/ggml.h
+++ b/ggml.h
@@ -773,6 +773,30 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
+
+//
+// Internal types and functions exposed for tests and benchmarks
+//
+
+#ifdef __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml_internal.h b/ggml_internal.h
deleted file mode 100644
index 6bfa441d5f6a9..0000000000000
--- a/ggml_internal.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#pragma once
-
-// Internal functions exposed for tests and benchmarks
-
-#ifdef __cplusplus
-// restrict not standard in C++
-#define restrict
-extern "C" {
-#endif
-
-typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
-typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
-typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
-
-typedef struct {
-    dequantize_row_q_t dequantize_row_q;
-    quantize_row_q_t   quantize_row_q;
-    quantize_row_q_t   quantize_row_q_reference;
-    vec_dot_q_t        vec_dot_q;
-} quantize_fns_t;
-
-quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/llama.cpp b/llama.cpp
index b736dd88a26ff..bc1f0101174f7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1,5 +1,4 @@
 #include "llama.h"
-#include "llama_internal.h"
 
 #include "ggml.h"
 
diff --git a/llama.h b/llama.h
index 04e2bf71cd9c0..deb09fe53959d 100644
--- a/llama.h
+++ b/llama.h
@@ -164,6 +164,13 @@ extern "C" {
 
 #ifdef __cplusplus
 }
+
+#include <string>
+#include <unordered_map>
+//
+// Internal function exposed for tests and benchmarks
+//
+std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx);
 #endif
 
 #endif
diff --git a/llama_internal.h b/llama_internal.h
deleted file mode 100644
index 25c8c2c8746ab..0000000000000
--- a/llama_internal.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef LLAMA_INTERNAL_H
-#define LLAMA_INTERNAL_H
-
-// Internal functions exposed for tests and benchmarks
-
-#include "ggml.h"
-
-#include <string>
-#include <unordered_map>
-
-std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx);
-
-#endif