From d1b68d8613955f7cc5a50cf6998b052bd889a56c Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 2 Sep 2021 16:49:13 +0800 Subject: [PATCH 1/2] update DFQ/EQ/Evaluate int8 perchannel quant tool --- tools/quantize/CMakeLists.txt | 2 + tools/quantize/README.md | 34 +- tools/quantize/algorithm/quant_dfq.cpp | 576 +++++++++++ tools/quantize/algorithm/quant_eq.cpp | 1297 ++++++++++++++++++++++++ tools/quantize/quant_save_graph.cpp | 5 + tools/quantize/quant_tool.hpp | 143 ++- tools/quantize/quant_tool_int8.cpp | 241 ++++- tools/quantize/quant_utils.cpp | 18 +- tools/quantize/quant_utils.hpp | 1 + 9 files changed, 2290 insertions(+), 27 deletions(-) create mode 100644 tools/quantize/algorithm/quant_dfq.cpp create mode 100644 tools/quantize/algorithm/quant_eq.cpp diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index ba8d3d961..b8504e500 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -30,6 +30,8 @@ IF (${TENGINE_TARGET_PROCESSOR} MATCHES "X86") ADD_EXECUTABLE( ${name} ./quant_save_graph.cpp + ./algorithm/quant_dfq.cpp + ./algorithm/quant_eq.cpp ./quant_utils.cpp ../save_graph/save_graph.cpp ../save_graph/tm2_op_save.cpp diff --git a/tools/quantize/README.md b/tools/quantize/README.md index e0e1d7547..968dc1441 100644 --- a/tools/quantize/README.md +++ b/tools/quantize/README.md @@ -76,7 +76,7 @@ Status : int8, per-channel, symmetric Before use the quant tool, **you need Float32 tmfile and Calibration Dataset**, the image num of calibration dataset we suggest to use 500-1000. 
``` -$ .quant_tool_int8 -m ./mobilenet_fp32.tmfile -i ./dataset -o ./mobilenet_int8.tmfile -g 3,224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017 +$ .quant_tool_int8 -m ./mobilenet_fp32.tmfile -i ./dataset -o ./mobilenet_int8.tmfile -g 3,224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017 -z 1 ---- Tengine Post Training Quantization Tool ---- @@ -111,6 +111,38 @@ Thread num : 1 [Quant Tools Info]: Step 4, quantize activation tensor done. [Quant Tools Info]: Step 5, quantize weight tensor done. [Quant Tools Info]: Step 6, save Int8 tmfile done, ./mobilenet_int8.tmfile +[Quant Tools Info]: Step Evaluate, evaluate quantitative losses +cosin 0 32 avg 0.995317 ### 0.000000 0.953895 0.998249 0.969256 ... +cosin 1 32 avg 0.982403 ### 0.000000 0.902383 0.964436 0.873998 ... +cosin 2 64 avg 0.976753 ### 0.952854 0.932301 0.982766 0.958503 ... +cosin 3 64 avg 0.981889 ### 0.976637 0.981754 0.987276 0.970671 ... +cosin 4 128 avg 0.979728 ### 0.993999 0.991858 0.990438 0.992766 ... +cosin 5 128 avg 0.970351 ### 0.772556 0.989541 0.986996 0.989563 ... +cosin 6 128 avg 0.954545 ### 0.950125 0.922964 0.946804 0.972852 ... +cosin 7 128 avg 0.977192 ### 0.994728 0.972071 0.995353 0.992700 ... +cosin 8 256 avg 0.977426 ### 0.968429 0.991248 0.991274 0.994450 ... +cosin 9 256 avg 0.962224 ### 0.985255 0.969171 0.958762 0.967461 ... +cosin 10 256 avg 0.954253 ### 0.984353 0.935643 0.656188 0.929778 ... +cosin 11 256 avg 0.971987 ### 0.997596 0.967681 0.476525 0.999115 ... +cosin 12 512 avg 0.972861 ### 0.968920 0.905907 0.993918 0.622953 ... +cosin 13 512 avg 0.959161 ### 0.935686 0.000000 0.642560 0.994388 ... +cosin 14 512 avg 0.963903 ### 0.979613 0.957169 0.976440 0.902512 ... +cosin 15 512 avg 0.963226 ### 0.977065 0.965819 0.998149 0.905297 ... +cosin 16 512 avg 0.960935 ### 0.861674 0.972926 0.950579 0.987609 ... +cosin 17 512 avg 0.961057 ### 0.738472 0.987884 0.999124 0.995397 ... +cosin 18 512 avg 0.960127 ### 0.935455 0.968909 0.970831 0.981240 ... 
+cosin 19 512 avg 0.963755 ### 0.972628 0.992305 0.999518 0.799737 ... +cosin 20 512 avg 0.949364 ### 0.922776 0.896038 0.945079 0.971338 ... +cosin 21 512 avg 0.961256 ### 0.902256 0.896438 0.923361 0.973974 ... +cosin 22 512 avg 0.946552 ### 0.963806 0.982075 0.878965 0.929992 ... +cosin 23 512 avg 0.953677 ### 0.953880 0.996364 0.936540 0.930796 ... +cosin 24 1024 avg 0.941197 ### 0.000000 0.992507 1.000000 0.994460 ... +cosin 25 1024 avg 0.973546 ### 1.000000 0.889181 0.000000 0.998084 ... +cosin 26 1024 avg 0.869351 ### 0.522966 0.000000 0.987009 0.000000 ... +cosin 27 1 avg 0.974982 ### 0.974982 +cosin 28 1 avg 0.974982 ### 0.974982 +cosin 29 1 avg 0.974982 ### 0.974982 +cosin 30 1 avg 0.978486 ### 0.978486 ---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\(^0^)/ ---- ``` diff --git a/tools/quantize/algorithm/quant_dfq.cpp b/tools/quantize/algorithm/quant_dfq.cpp new file mode 100644 index 000000000..66be8df81 --- /dev/null +++ b/tools/quantize/algorithm/quant_dfq.cpp @@ -0,0 +1,576 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "../quant_tool.hpp" + +//int QuantTool::data_free_quant(const char* model_file, const char* image_dir, +// int img_c, int img_h, int img_w, const float* mean, const float* scale, +// int num_thread, int sw_RGB, int center_crop) +int QuantTool::data_free_quant() +{ + int letterbox = 0; + int loop_count = 1; + const char* image_file = nullptr; + + + /* set runtime options */ + struct options opt; + opt.num_thread = num_thread; + opt.cluster = TENGINE_CLUSTER_ALL; + opt.precision = TENGINE_MODE_FP32; + +// /* inital tengine */ +// if (init_tengine() != 0) +// { +// fprintf(stderr, "Initial tengine failed.\n"); +// return -1; +// } +// fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version()); + + /* create graph, load tengine model xxx.tmfile */ + graph_t graph = create_graph(NULL, "tengine", model_file.c_str()); + if (NULL == graph) + { + fprintf(stderr, "Create graph failed.\n"); + fprintf(stderr, "errno: %d \n", get_tengine_errno()); + return -1; + } + + struct graph* graphn = (struct graph*)graph; + struct node_graph* node_proto = ( struct node_graph* )sys_malloc( sizeof(struct node_graph) * graphn->node_num); + + for (int i = 0; i < graphn->node_num; i++) + { + struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_type = n->op.type; + const char* layer_name = n->name; //layer name + + const uint16_t input_num = n->input_num; //input num + const uint16_t output_num = n->output_num; //output num + + node_proto[i].pass = 0; +// node_proto[i].input_node_list = create_vector(sizeof(uint16_t), NULL); +// node_proto[i].output_node_list = create_vector(sizeof(uint16_t), NULL); + + for (int j = 0; j < input_num; j++) + { + struct tensor* input_tensor = get_ir_graph_tensor(graphn, n->input_tensors[j]); + const char* input_tensor_name = input_tensor->name; + uint8_t dim_num = input_tensor->dim_num; + + if 
(input_tensor->producer >= 0) + { + struct node* node = graphn->node_list[input_tensor->producer]; + node_proto[i].input_node_list.push_back(node->index); + node_proto[node->index].output_node_list.push_back(i); + } + if (OP_CONV == op_type || OP_FC == op_type) + { + break; + } + } + } + + for (int i = 0; i < graphn->node_num; i++) + { + struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_type = n->op.type; + const char* layer_name = n->name; //layer name + if (op_type != NULL) + { + if (OP_CONV != op_type && OP_FC != op_type) + { + if (node_proto[i].input_node_list.size() == 1 && node_proto[i].output_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + uint16_t node_output_id = node_proto[i].output_node_list[0]; + if (node_proto[node_input_id].output_node_list.size() == 1 && node_proto[node_output_id].input_node_list.size() == 1) + { + node_proto[i].input_node_list.erase(node_proto[i].input_node_list.begin() + 0); + node_proto[i].output_node_list.erase(node_proto[i].output_node_list.begin() + 0); + + node_proto[node_input_id].output_node_list.erase(node_proto[node_input_id].output_node_list.begin() + 0); + node_proto[node_input_id].output_node_list.push_back(node_output_id); + + node_proto[node_output_id].input_node_list.erase(node_proto[node_output_id].input_node_list.begin() + 0); + node_proto[node_output_id].input_node_list.push_back(node_input_id); + } + } + } + } + } + + for (int i = 0; i < graphn->node_num; i++) + { + struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_name = n->op.type; + const char* layer_name = n->name; //layer name + + const uint16_t input_num = n->input_num; //input num + const uint16_t output_num = n->output_num; //output num + + if (op_name != NULL) + { + if (OP_CONV == op_name) + { + // DW_Conv && Direct_Conv + struct conv_param* conv_param = ( struct conv_param* )n->op.param_mem; + if 
(conv_param->group == conv_param->output_channel) + { +// printf(" #### DW Conv ####\n"); + if (node_proto[i].input_node_list.size() == 1 && node_proto[i].output_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + uint16_t node_output_id = node_proto[i].output_node_list[0]; + auto op_name0 = graphn->node_list[node_input_id]->op.type; + auto op_name2 = graphn->node_list[node_input_id]->op.type; + + if (node_proto[node_input_id].output_node_list.size() == 1 && + node_proto[node_output_id].input_node_list.size() == 1 && + OP_CONV == op_name0 && OP_CONV == op_name2) + { + node_proto[i].pass = 1; //layer1 + node_proto[node_input_id].pass = 1; //layer0 + node_proto[node_output_id].pass = 1; //layer2 + + // layer0 min/max range + struct node* nodeP = graphn->node_list[node_input_id]; + struct tensor* input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + uint16_t dims0 = input_tensor->dims[0]; + uint16_t dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer0_max(dims0, 0.0f); + std::vector layer0_min(dims0, 0.0f); + std::vector layer0_range(dims0, 0.0f); + + float* data_layer0 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + if (data_layer0[dims123 * d0 + d1] > layer0_max[d0]) + layer0_max[d0] = data_layer0[dims123 * d0 + d1]; + if (data_layer0[dims123 * d0 + d1] < layer0_max[d0]) + layer0_min[d0] = data_layer0[dims123 * d0 + d1]; + } + } +// printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer0_range[d0] = layer0_max[d0] - layer0_min[d0]; + } + + // layer1 min/max range + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer1_max(dims0, 0.0f); + std::vector layer1_min(dims0, 0.0f); + std::vector 
layer1_range(dims0, 0.0f); + + float* data_layer1 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + if (data_layer1[dims123 * d0 + d1] > layer1_max[d0]) + layer1_max[d0] = data_layer1[dims123 * d0 + d1]; + if (data_layer1[dims123 * d0 + d1] < layer1_max[d0]) + layer1_min[d0] = data_layer1[dims123 * d0 + d1]; + } + } +// printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer1_range[d0] = layer1_max[d0] - layer1_min[d0]; + } + + // layer2 min/max range + nodeP = graphn->node_list[node_output_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + uint16_t dims1 = input_tensor->dims[1]; + uint16_t dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer2_max(dims0, 0.0f); + std::vector layer2_min(dims0, 0.0f); + std::vector layer2_range(dims0, 0.0f); + + float* data_layer2 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + if (data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] > layer2_max[d1]) + { + layer2_max[d1] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + if (data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] < layer2_min[d1]) + { + layer2_min[d1] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + } + } + } +// printf("### %d ###\n",dims1); + for (int d1 = 0; d1 < dims1; d1++) + { + layer2_range[d1] = layer2_max[d1] - layer2_min[d1]; + } + +////////////////////////////////////////////////////////////////////////////////// + + // layer ops sqrt + float ops_range[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + ops_range[ops] = pow(layer0_range[ops] * layer1_range[ops] * layer2_range[ops], 1.0/3); + } + + float S01[dims1]; + float S01_F[dims1]; + float S12[dims1]; + float S12_F[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + if (ops_range[ops] == 0) + { + 
S01[ops] = 0.0; + S12_F[ops] = 0.0; + } + else + { + S01[ops] = layer0_range[ops]/ops_range[ops]; + S12_F[ops] = layer2_range[ops]/ops_range[ops]; + } + if (layer0_range[ops] == 0) + S01_F[ops] = 0.0; + else + S01_F[ops] = ops_range[ops]/layer0_range[ops]; + if (layer2_range[ops] == 0) + S12[ops] = 0.0; + else + S12[ops] = ops_range[ops]/layer2_range[ops]; + } +////////////////////////////////////////////////////////////////////////////////// + + // layer0 output + nodeP = graphn->node_list[node_input_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer0[dims123 * d0 + d1] = data_layer0[dims123 * d0 + d1] * S01_F[d0]; + } + } + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + dims0 = input_tensor->dims[0]; + float* data_layer0_bias = (float *)sys_malloc(sizeof(float) * dims0); + data_layer0_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer0_bias[d0] = data_layer0_bias[d0] * S01_F[d0]; + } + + // layer1 output + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer1[dims123 * d0 + d1] = data_layer1[dims123 * d0 + d1] * S01[d0] * S12_F[d0]; + } + } + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + dims0 = input_tensor->dims[0]; + float* data_layer1_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer1_bias[d0] = data_layer1_bias[d0] * S12_F[d0]; + } + + // layer2 output + nodeP = graphn->node_list[node_output_id]; + input_tensor = get_ir_graph_tensor(graphn, 
nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims1 = input_tensor->dims[1]; + dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] * S12[d1]; + } + } + } + } + } + } + else + { +// printf(" #### Direct Conv ####\n"); + if (node_proto[i].pass == 0) + { + if (node_proto[i].input_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + if (graphn->node_list[node_input_id]->input_num > 0) + { + auto op_name0 = graphn->node_list[node_input_id]->op.type; + + if (node_proto[node_input_id].output_node_list.size() == 1 && + op_name0 == OP_CONV) + { + node_proto[i].pass = 1; //layer1 + node_proto[node_input_id].pass = 1; //layer0 + + // layer0 min/max range + struct node* nodeP = graphn->node_list[node_input_id]; + struct tensor* input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + uint16_t dims0 = input_tensor->dims[0]; + uint16_t dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer0_max(dims0, 0.0f); + std::vector layer0_min(dims0, 0.0f); + std::vector layer0_range(dims0, 0.0f); + + float* data_layer0 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + if (data_layer0[dims123 * d0 + d1] > layer0_max[d0]) + layer0_max[d0] = data_layer0[dims123 * d0 + d1]; + if (data_layer0[dims123 * d0 + d1] < layer0_max[d0]) + layer0_min[d0] = data_layer0[dims123 * d0 + d1]; + } + } +// printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer0_range[d0] = layer0_max[d0] - layer0_min[d0]; + } + + // layer1 min/max range + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + 
uint16_t dims1 = input_tensor->dims[1]; + uint16_t dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer1_max(dims0, 0.0f); + std::vector layer1_min(dims0, 0.0f); + std::vector layer1_range(dims0, 0.0f); + + float* data_layer1 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + if (data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] > layer1_max[d1]) + { + layer1_max[d1] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + if (data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] < layer1_min[d1]) + { + layer1_min[d1] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + } + } + } +// printf("### %d ###\n",dims1); + for (int d0 = 0; d0 < dims1; d0++) + { + layer1_range[d0] = layer1_max[d0] - layer1_min[d0]; + } + +////////////////////////////////////////////////////////////////////////////////// + + // layer ops sqrt + float ops_range[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + ops_range[ops] = sqrt(layer0_range[ops] * layer1_range[ops]); + } + + float S01[dims1]; + float S01_F[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + if (ops_range[ops] == 0) + { + S01[ops] = 0.0; + } + else + { + S01[ops] = layer0_range[ops]/ops_range[ops]; + } + if (layer0_range[ops] == 0) + S01_F[ops] = 0.0; + else + S01_F[ops] = ops_range[ops]/layer0_range[ops]; + } +////////////////////////////////////////////////////////////////////////////////// + // layer0 output + nodeP = graphn->node_list[node_input_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer0[dims123 * d0 + d1] = data_layer0[dims123 * d0 + d1] * S01_F[d0]; + } + } + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + 
dims0 = input_tensor->dims[0]; + float* data_layer0_bias = (float *)sys_malloc(sizeof(float) * dims0); + data_layer0_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer0_bias[d0] = data_layer0_bias[d0] * S01_F[d0]; + } + + // layer1 output + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims1 = input_tensor->dims[1]; + dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] * S01[d1]; + } + } + } + } + } + } + } + } + } + } + } + + if (!save_graph(graph, "test_dfq_fp32.tmfile")) + { + fprintf(stderr, "save graph failed.\n"); + return -1; + } + + /* set the shape, data buffer of input_tensor of the graph */ + int img_size = img_h * img_w * img_c; + int dims[] = {1, img_c, img_h, img_w}; // nchw + float* input_data = ( float* )malloc(img_size * sizeof(float)); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if (input_tensor == NULL) + { + fprintf(stderr, "Get input tensor failed\n"); + return -1; + } + + if (set_tensor_shape(input_tensor, dims, 4) < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + return -1; + } + + if (set_tensor_buffer(input_tensor, input_data, img_size * 4) < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread(graph, opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + std::vector imgs_list; + if (image_dir.c_str() != NULL) + { + readFileList(image_dir, imgs_list); + } + else + { + imgs_list.push_back(image_file); + } + uint32_t img_num = imgs_list.size(); + + /* prepare process input data, set 
the data mem to input tensor */ + get_input_data_cv(imgs_list[0].c_str(), input_data, img_c, img_h, img_w, mean, scale, + 1, 0, 0, 0, 0); + + /* run graph */ + for (int i = 0; i < loop_count; i++) + { + double start = get_current_time(); + if (run_graph(graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + } + + /* get the result of classification */ + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + float* output_data = ( float* )get_tensor_buffer(output_tensor); + int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); + +// printf("out put data %f %d \n",output_data[0], output_size); + fprintf(stderr, "--------------------------------------\n"); + + /* release tengine */ + free(input_data); + postrun_graph(graph); + destroy_graph(graph); +// release_tengine(); + + return 0; +} diff --git a/tools/quantize/algorithm/quant_eq.cpp b/tools/quantize/algorithm/quant_eq.cpp new file mode 100644 index 000000000..9d2b31178 --- /dev/null +++ b/tools/quantize/algorithm/quant_eq.cpp @@ -0,0 +1,1297 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "../quant_tool.hpp" + +int QuantTool::init() +{ + // ir graph variable + this->fp32_out.clear(); + this->fake_quant_out.clear(); + + /* load fp32 graph and fake quant graph */ + this->graphn_fp32 = ( struct graph* )create_graph(nullptr, "tengine", this->model_file.c_str()); + this->graphn_fake_quant = ( struct graph* )create_graph(nullptr, "tengine", this->model_file.c_str()); + + if (this->graphn_fp32 == nullptr || this->graphn_fake_quant == nullptr) + { + fprintf(stderr, "Create graph failed.\n"); + fprintf(stderr, "errno: %d \n", get_tengine_errno()); + return -1; + } + + /* load activation scale to ir_tensor */ + this->load_activation_scale(this->graphn_fp32, this->scale_file.c_str(), this->inplace); + this->load_activation_scale(this->graphn_fake_quant, this->scale_file.c_str(), this->inplace); + + /* get graph input tensor */ + this->graph_input_tensor_fp32 = ( struct tensor* )get_graph_input_tensor(( void* )this->graphn_fp32, 0, 0); + this->graph_input_tensor_fake_quant = + ( struct tensor* )get_graph_input_tensor(( void* )this->graphn_fake_quant, 0, 0); + if (this->graph_input_tensor_fp32 == nullptr || this->graph_input_tensor_fake_quant == nullptr) + { + fprintf(stderr, "Get input tensor failed\n"); + return -1; + } + + /* generate images list */ + std::vector imgs_list; + if (!this->image_dir.empty()) + readFileList(this->image_dir, imgs_list); + uint32_t img_num = imgs_list.size(); + + this->max_search_img_num = 50; + if (img_num < this->max_search_img_num) + this->max_search_img_num = img_num; + +// fprintf(stderr, "# eq dataset num %d\n", this->max_search_img_num); + + /* set the shape, data buffer of input_tensor of the graph */ + this->img_size = this->img_h * this->img_w * this->img_c; + int dims[] = {1, img_c, img_h, img_w}; // nchw + float* input_data_fp32 = ( float* )malloc(this->img_size * sizeof(float)); + float* input_data_fake_quant = ( float* 
)malloc(this->img_size * sizeof(float)); + + /* prepare process input data, set the data mem to input tensor */ + float scale_graph_input = this->graph_input_tensor_fake_quant->scale; + int zero_point_graph_input = this->graph_input_tensor_fake_quant->zero_point; +// fprintf(stderr, "scale zp %f %d\n", scale_graph_input, zero_point_graph_input); + + this->input_datas_fp32.resize(this->max_search_img_num); + this->input_datas_fake_quant.resize(this->max_search_img_num); + + for (int i = 0; i < this->max_search_img_num; i++) + { + this->input_datas_fp32[i].resize(this->img_size); + this->input_datas_fake_quant[i].resize(this->img_size); + + get_input_data_cv(imgs_list[i].c_str(), this->input_datas_fp32[i].data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); + + + this->input_datas_fake_quant[i] = this->input_datas_fp32[i]; + this->activation_requant(this->input_datas_fake_quant[i].data(), this->img_size, 8, 1, scale_graph_input, + zero_point_graph_input); + } + + /* set graph input shape */ + int ret_fp32 = set_tensor_shape(this->graph_input_tensor_fp32, dims, 4); + int ret_fake_quant = set_tensor_shape(this->graph_input_tensor_fake_quant, dims, 4); + if (ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + return -1; + } + + /* set graph input buffer */ + ret_fp32 = set_tensor_buffer(this->graph_input_tensor_fp32, input_data_fp32, this->img_size * 4); + ret_fake_quant = set_tensor_buffer(this->graph_input_tensor_fake_quant, input_data_fake_quant, this->img_size * 4); + if (ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread(( void* )this->graphn_fp32, this->opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + ret_fp32 = prerun_graph_multithread((void*)this->graphn_fp32, 
this->opt); + ret_fake_quant = prerun_graph_multithread((void*)this->graphn_fake_quant, this->opt); + if (ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + /* get exec graph */ + this->exec_graph_fp32 = this->get_exec_graph(this->graphn_fp32); + this->exec_graph_fake_quant = this->get_exec_graph(this->graphn_fake_quant); + this->exec_node_num = get_vector_num(this->exec_graph_fp32->exec_node_list); + + /* ir idx <<<->>> exec idx */ + for (int i = 0; i < this->exec_node_num; i++) + { + this->node_fp32 = ( struct exec_node* )get_vector_data(this->exec_graph_fp32->exec_node_list, i); + this->node_fake_quant = ( struct exec_node* )get_vector_data(this->exec_graph_fake_quant->exec_node_list, i); + + int out_t = node_fp32->ir_node->output_tensors[0]; + this->ir_exec[graphn_fp32->tensor_list[out_t]->producer] = i; // ir idx --> exec idx + this->exec_ir[i] = graphn_fp32->tensor_list[out_t]->producer; // exec idx --> ir idx +// printf(" %d : %d\n", graphn_fp32->tensor_list[out_t]->producer, i); + } + + /* check for free node*/ + this->check_for_free(); + + return 0; +} + +void QuantTool::activation_requant(float* data, int elem_num, int bitcount, int symmetry, float scale, int zero_point) +{ +// symmetry = 0; + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2,bitcount-symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2,bitcount-symmetry) - 1; + fake_quant_min = 0; + } + + for (int i = 0; i < elem_num; i++) + { + data[i] = round(data[i] / scale) + zero_point; + data[i] = data[i] > fake_quant_max ? fake_quant_max : data[i]; + data[i] = data[i] < fake_quant_min ? 
fake_quant_min : data[i]; + data[i] = (data[i] - zero_point) * scale; + } +} + +void QuantTool::recursion_pass_through(struct graph* graphn, const char* layer_name, struct tensor* t, + dict_str2int &layer_used, dict_str2float &layer_scale, dict_str2float &layer_zeropoint, dict_str2int &layer_pass) +{ + if (layer_pass[t->name] == 0 && layer_used[t->name] < 2) + { + t->scale = layer_scale[layer_name]; + t->zero_point = layer_zeropoint[layer_name]; + layer_scale[t->name] = layer_scale[layer_name]; + layer_zeropoint[t->name] = layer_zeropoint[layer_name]; + + uint32_t ir_node_idx = t->producer; + struct node* t_node = graphn->node_list[ir_node_idx]; + + auto op_name = t_node->op.type; + bool poolTrue = false; + bool reluTrue = false; + if (op_name == OP_POOL) + { + struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == OP_RELU) + { + struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + if (op_name == OP_FLATTEN || op_name == OP_RESHAPE || op_name == OP_SQUEEZE || op_name == OP_CLIP || + poolTrue || reluTrue) + { + struct tensor* t_in_tensor = graphn->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[t->name] != 0) + { + if (t_in_tensor->tensor_type == 1 || t_in_tensor->tensor_type == 3) + { + QuantTool::recursion_pass_through(graphn, t->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); + } + } + } + layer_pass[t->name] = 1; + } +} + +struct exec_graph* QuantTool::get_exec_graph(struct graph* graphn) +{ + struct subgraph* subgraph = get_ir_graph_subgraph(graphn, 0); + struct exec_graph* exec_graph = ( struct exec_graph* )subgraph->device_graph; + + return exec_graph; +} + +void QuantTool::check_for_free() +{ + dict_uint2uint nodeA2B; + for (int i = 0; i < this->exec_node_num; i++) + { + this->node_fp32 = ( struct exec_node* 
)get_vector_data(this->exec_graph_fp32->exec_node_list, i); + this->op_name = this->node_fp32->ir_node->op.type; + + for (int j = 0; j < this->node_fp32->ir_node->input_num; j++) + { + struct tensor* t = graphn_fp32->tensor_list[node_fp32->ir_node->input_tensors[j]]; + if (t->tensor_type == 1) + { + uint32_t ir_idx = t->producer; + nodeA2B[this->ir_exec[ir_idx]] = i; + } + } + } + + for (auto iter = nodeA2B.begin(); iter != nodeA2B.end(); iter++) + { + this->dict_free[iter->second].push_back(iter->first); +// printf(" map %d %d\n", iter->first, iter->second); + } +} + +void QuantTool::check_for_interlearve() +{ + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + /* get weight tensor */ + this->weight_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->input_tensors[1]]; + this->weight_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[1]]; + this->weight_size = this->weight_tensor_fp32->elem_num * this->weight_tensor_fp32->elem_size; + + this->weight_data_fp32 = (float*)this->weight_tensor_fp32->data; + this->weight_data_fake_quant = (float*)this->weight_tensor_fake_quant->data; + + if (this->op_name == OP_CONV) + { + this->conv_param_fp32 = ( struct conv_param* )this->node_fp32->ir_node->op.param_mem; + this->conv_param_fake_quant = ( struct conv_param* )this->node_fake_quant->ir_node->op.param_mem; + + if (this->conv_param_fp32->group != this->conv_param_fp32->output_channel) + { + this->conv_priv_info_fp32 = ( struct conv_priv_info* )this->node_fp32->ops_priv; + this->conv_priv_info_fake_quant = ( struct conv_priv_info* )this->node_fake_quant->ops_priv; + + this->interleave_size_fake = this->conv_priv_info_fp32->interleave_buffer_pack4_size; + + this->interleave_buffer_fp32 = ( float* )this->conv_priv_info_fp32->interleave_buffer_pack4; + this->interleave_buffer_fake_quant = ( float* )this->conv_priv_info_fake_quant->interleave_buffer_pack4; + } + } + else + 
this->interleave_size_fake = 0; + } +} + + +void QuantTool::weight_bias_requant(int search) +{ + /* weight requant */ +// printf("### 1.1 this->weight_tensor_fake_quant->scale %f\n",this->weight_tensor_fake_quant->scale); + if (0 == search) + this->weight_requant(this->weight_tensor_fake_quant, this->weight_data_fake_quant, this->weight_tensor_fake_quant->elem_num, 8, 1, this->weight_tensor_fake_quant->dims[0]); + + if (this->interleave_size_fake != 0) + { + int M = this->weight_tensor_fake_quant->dims[0]; + int K = this->weight_tensor_fake_quant->elem_num / weight_tensor_fake_quant->dims[0]; + this->conv_hcl_interleave_pack4_fp32(M, K, this->weight_data_fake_quant, this->interleave_buffer_fake_quant); + } + + /* bias requant */ + if (this->node_fake_quant->ir_node->input_num > 2) + { + this->input_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[0]]; + this->bias_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[2]]; + this->bias_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->input_tensors[2]]; + this->bias_size = this->bias_tensor_fp32->elem_num * this->bias_tensor_fp32->elem_size; + this->bias_data_fp32 = (float*)this->bias_tensor_fp32->data; + this->bias_data_fake_quant = (float*)this->bias_tensor_fake_quant->data; + this->bias_requant(this->input_tensor_fake_quant, this->weight_tensor_fake_quant, this->bias_tensor_fake_quant, + this->bias_data_fake_quant, this->bias_tensor_fake_quant->elem_num, this->bias_tensor_fake_quant->dims[0]); +// this->bias_tensor_fp32->scale = this->bias_tensor_fake_quant->scale; + } +} + +void QuantTool::set_node_input_output_tensor(int idx, int imgi, int snum) +{ + this->out_imgs_fp32[imgi].resize(this->output_tensor_fp32->elem_num); + this->out_imgs_fake_quant[imgi].resize(this->output_tensor_fp32->elem_num); + + if (idx == 0) + { + set_tensor_buffer(this->graph_input_tensor_fp32, 
this->input_datas_fp32[imgi].data(), this->img_size * 4); + set_tensor_buffer(this->graph_input_tensor_fake_quant, this->input_datas_fake_quant[imgi].data(), this->img_size * 4); + } + else + { + for (int inputi = 0; inputi < this->node_fp32->ir_node->input_num; inputi++) + { + uint32_t ir_input_tensor_idx = this->node_fp32->ir_node->input_tensors[inputi]; + this->input_tensor_fp32 = this->graphn_fp32->tensor_list[ir_input_tensor_idx]; + this->input_tensor_fake_quant = this->graphn_fake_quant->tensor_list[ir_input_tensor_idx]; + + if (this->input_tensor_fp32->tensor_type == 1) + { + uint32_t ir_node_idx = this->input_tensor_fp32->producer; + uint32_t input_size = this->input_tensor_fp32->elem_num * input_tensor_fp32->elem_size; + + uint32_t exec_node_idx = this->ir_exec[ir_node_idx]; + + if (imgi == 0 && snum == 0) + { + float* buf_fp32 = (float*)sys_malloc(32); + float* buf_fake_quant = (float*)sys_malloc(32); + + set_tensor_buffer(this->input_tensor_fp32, buf_fp32, input_size); + set_tensor_buffer(this->input_tensor_fake_quant, buf_fake_quant, input_size); + + set_tensor_buffer(this->input_tensor_fp32, this->fp32_out[exec_node_idx][imgi].data(), input_size); + set_tensor_buffer(this->input_tensor_fake_quant, this->fake_quant_out[exec_node_idx][imgi].data(), input_size); + } + else + { + set_tensor_buffer(this->input_tensor_fp32, this->fp32_out[exec_node_idx][imgi].data(), input_size); + set_tensor_buffer(this->input_tensor_fake_quant, this->fake_quant_out[exec_node_idx][imgi].data(), input_size); + } + } // output tensor + } // node input number + } // node i > 0 + + /* init output buffer */ + set_tensor_buffer(this->output_tensor_fp32, this->out_imgs_fp32[imgi].data(), this->output_tensor_fp32->elem_num * this->output_tensor_fp32->elem_size); + set_tensor_buffer(this->output_tensor_fake_quant, this->out_imgs_fake_quant[imgi].data(), this->output_tensor_fake_quant->elem_num * this->output_tensor_fake_quant->elem_size); +} + +double 
QuantTool::cosin_similarity(std::vector > &in_a,std::vector > &in_b, uint32_t imgs_num, uint32_t output_num) +{ + double norm_a=0; + double norm_b=0; + double a_b=0; + + uint32_t fnum = (output_num >> 4) << 4; + uint32_t rnum = output_num - fnum; + +#if 0 //__AVX__ + + float _sumaa0[8] = {0.f}; + float _sumbb0[8] = {0.f}; + float _sumaabb0[8] = {0.f}; + float _sumaa1[8] = {0.f}; + float _sumbb1[8] = {0.f}; + float _sumaabb1[8] = {0.f}; + + __m256 _suma_o0 = _mm256_set1_ps(0.0); + __m256 _sumb_o0 = _mm256_set1_ps(0.0); + __m256 _sumab_o0 = _mm256_set1_ps(0.0); + __m256 _suma_o1 = _mm256_set1_ps(0.0); + __m256 _sumb_o1 = _mm256_set1_ps(0.0); + __m256 _sumab_o1 = _mm256_set1_ps(0.0); + + for (int i = 0; i < imgs_num; i++) + { + const float* in_a_addr = in_a[i].data(); + const float* in_b_addr = in_b[i].data(); + for (int j = 0; j < fnum; j=j+32) + { + __m256 _in_a0 = _mm256_loadu_ps(in_a_addr+j); + __m256 _in_b0 = _mm256_loadu_ps(in_b_addr+j); + __m256 _in_a1 = _mm256_loadu_ps(in_a_addr+j+8); + __m256 _in_b1 = _mm256_loadu_ps(in_b_addr+j+8); + + _suma_o0 = _mm256_fmadd_ps(_in_a0, _in_a0, _suma_o0); + _sumb_o0 = _mm256_fmadd_ps(_in_b0, _in_b0, _sumb_o0); + _sumab_o0 = _mm256_fmadd_ps(_in_a0, _in_b0, _sumab_o0); + _suma_o1 = _mm256_fmadd_ps(_in_a1, _in_a1, _suma_o1); + _sumb_o1 = _mm256_fmadd_ps(_in_b1, _in_b1, _sumb_o1); + _sumab_o1 = _mm256_fmadd_ps(_in_a1, _in_b1, _sumab_o1); + } + } + _mm256_storeu_ps(_sumaa0, _suma_o0); + _mm256_storeu_ps(_sumbb0, _sumb_o0); + _mm256_storeu_ps(_sumaabb0, _sumab_o0); + _mm256_storeu_ps(_sumaa1, _suma_o1); + _mm256_storeu_ps(_sumbb1, _sumb_o1); + _mm256_storeu_ps(_sumaabb1, _sumab_o1); + + for (int i = 0; i < 8; i++) + { + norm_a += _sumaa0[i] + _sumaa1[i]; + norm_b += _sumbb0[i] + _sumbb1[i]; + a_b += _sumaabb0[i] + _sumaabb1[i]; + + } + +#else // normal +// printf("AAAA DIRECT\n"); + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < fnum; j=j+8) + { + for (int k = 0; k < 8; k=k+1) + { + norm_a += in_a[i][j+k] * 
in_a[i][j+k]; + + norm_b += in_b[i][j+k] * in_b[i][j+k]; + + a_b += in_a[i][j+k] * in_b[i][j+k]; + } + } + } + +#endif // __SSE__ __AVX__ + + for (int j = fnum; j < output_num; j++) + { + for (int i = 0; i < imgs_num; i++) + { + norm_a += in_a[i][j] * in_a[i][j]; + norm_b += in_b[i][j] * in_b[i][j]; + a_b += in_a[i][j] * in_b[i][j]; + } + } + + double cosin=0.0; + double _a_b_ = sqrt(norm_a) * sqrt(norm_b); + if(_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin = a_b; + else + cosin = a_b/_a_b_; + if (cosin < -999999 || cosin > 999999) + cosin = 0; + return cosin; +} + +double QuantTool::cosin_similarity(std::vector* in_a,std::vector* in_b, uint32_t imgs_num, uint32_t output_num) +{ + uint32_t output_channel = 1; + std::vector norm_a(output_channel, 0.0); + std::vector norm_b(output_channel, 0.0); + std::vector a_b(output_channel, 0.0); + + int elem_perchannel = int(output_num / output_channel); + + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < output_channel; j++) + { + for (int k = 0; k < elem_perchannel; k++) + { + int elem_idx = j * elem_perchannel + k; + norm_a[j] += in_a[i][elem_idx] * in_a[i][elem_idx]; + norm_b[j] += in_b[i][elem_idx] * in_b[i][elem_idx]; + a_b[j] += in_a[i][elem_idx] * in_b[i][elem_idx]; + } + } + } + + double cosin; + for (int j = 0; j < output_channel; j++) + { + double _a_b_ = sqrt(norm_a[j]) * sqrt(norm_b[j]); + if(_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin = a_b[j]; + else + cosin = a_b[j]/_a_b_; + if (cosin < -999999 || cosin > 999999) + cosin = 0; + } + return cosin; +} + +void QuantTool::weight_requant(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel) +{ + float* scale_list = (float *)sys_malloc(elem_channel * 4); + int* zero_point_list = (int *)sys_malloc(elem_channel * 4); + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2,bitcount-symmetry) - 1; + 
fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2,bitcount-symmetry) - 1; + fake_quant_min = 0; + } + + float scale = 1; + int zero_point = 0; + for (int c = 0; c < elem_channel; c++) + { + float weight_max = *std::max_element(data + c*elem_perchannel, data + (c+1)*elem_perchannel); + float weight_min = *std::min_element(data + c*elem_perchannel, data + (c+1)*elem_perchannel); + if (symmetry == 1) + { + if (abs(weight_max) > abs(weight_min)) + scale = abs(weight_max)/fake_quant_max; + else + scale = abs(weight_min)/fake_quant_max; + zero_point = 0; + } + else + { + scale = (weight_max - weight_min)/fake_quant_max; + zero_point = int(- weight_min / scale); + } + + scale_list[c] = scale; + zero_point_list[c] = zero_point; + } + + if (weight_tensor->scale_list == NULL) + { +// printf(" EMPTY\n "); + weight_tensor->scale_list = scale_list; + weight_tensor->zp_list = zero_point_list; + } + else + { + scale_list = weight_tensor->scale_list; + zero_point_list = weight_tensor->zp_list; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i*elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale_list[i]) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? 
fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale_list[i]; + } + } + } +} + +void QuantTool::conv_hcl_interleave_pack4_fp32(int M, int K, float* pA, float* pA_t) +{ + int nn_outch = M >> 3; + int remain_outch_start = nn_outch << 3; + + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + const float* k0 = pA + (p + 0) * K; + const float* k1 = pA + (p + 1) * K; + const float* k2 = pA + (p + 2) * K; + const float* k3 = pA + (p + 3) * K; + const float* k4 = pA + (p + 4) * K; + const float* k5 = pA + (p + 5) * K; + const float* k6 = pA + (p + 6) * K; + const float* k7 = pA + (p + 7) * K; + + float* ktmp = pA_t + (p / 8) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp[1] = k1[0]; + ktmp[2] = k2[0]; + ktmp[3] = k3[0]; + ktmp[4] = k4[0]; + ktmp[5] = k5[0]; + ktmp[6] = k6[0]; + ktmp[7] = k7[0]; + ktmp += 8; + + k0 += 1; + k1 += 1; + k2 += 1; + k3 += 1; + k4 += 1; + k5 += 1; + k6 += 1; + k7 += 1; + } + } + + nn_outch = (M - remain_outch_start) >> 2; + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + const float* k0 = pA + (p + 0) * K; + const float* k1 = pA + (p + 1) * K; + const float* k2 = pA + (p + 2) * K; + const float* k3 = pA + (p + 3) * K; + + float* ktmp = pA_t + (p / 8 + (p % 8) / 4) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp[1] = k1[0]; + ktmp[2] = k2[0]; + ktmp[3] = k3[0]; + ktmp += 4; + + k0 += 1; + k1 += 1; + k2 += 1; + k3 += 1; + } + } + + remain_outch_start += nn_outch << 2; + + for (int p = remain_outch_start; p < M; p++) + { + const float* k0 = pA + (p + 0) * K; + + float* ktmp = pA_t + (p / 8 + (p % 8) / 4 + p % 4) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp++; + k0++; + } + } +} + +void QuantTool::gen_weight_scale(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel) +{ + float* scale_list = (float *)sys_malloc(elem_channel * 4); + int* 
zero_point_list = (int *)sys_malloc(elem_channel * 4); + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2,bitcount-symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2,bitcount-symmetry) - 1; + fake_quant_min = 0; + } + + float scale = 1; + int zero_point = 0; + for (int c = 0; c < elem_channel; c++) + { + float weight_max = *std::max_element(data + c*elem_perchannel, data + (c+1)*elem_perchannel); + float weight_min = *std::min_element(data + c*elem_perchannel, data + (c+1)*elem_perchannel); + if (symmetry == 1) + { + if (abs(weight_max) > abs(weight_min)) + scale = abs(weight_max)/fake_quant_max; + else + scale = abs(weight_min)/fake_quant_max; + zero_point = 0; + } + else + { + scale = (weight_max - weight_min)/fake_quant_max; + zero_point = int(- weight_min / scale); + } + + scale_list[c] = scale; + zero_point_list[c] = zero_point; + } + + weight_tensor->scale_list = scale_list; + weight_tensor->zp_list = zero_point_list; +} + +void QuantTool::bias_requant(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, + float* data, int elem_num, int elem_channel) +{ + int elem_perchannel = elem_num / elem_channel; + float* scale_list = (float *)sys_malloc(elem_channel * 4); + + for (int c = 0; c < elem_channel; c++) + { + float input_scale = input_tensor->scale; + float weight_scale = weight_tensor->scale_list[c]; + float bias_scale = input_scale * weight_scale; + scale_list[c] = bias_scale; + } + + bias_tensor->scale_list = scale_list; + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i*elem_perchannel + j; + if (scale_list[i] == 0) + { + data[data_idx] = 0; + } + else + { + data[data_idx] = round(data[data_idx] / scale_list[i]); + data[data_idx] = data[data_idx] * scale_list[i]; + } + } + } + +} + + +void 
QuantTool::weight_bias_reset() +{ + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + std::memcpy(this->weight_data_fake_quant, this->weight_data_fp32, this->weight_size); + std::memcpy(this->interleave_buffer_fake_quant, this->interleave_buffer_fp32, this->interleave_size_fake); + if (this->node_fake_quant->ir_node->input_num > 2) + { + memcpy(this->bias_data_fake_quant, this->bias_data_fp32, this->bias_size); + } + } +} + +void QuantTool::free_used_layers(int idx) +{ +// printf("#### free 0 idx %d\n",idx); + if (this->dict_free[idx].size() > 0) + { +// printf("#### free 1 idx %d\n",idx); + std::vector > freen_fp32; + std::vector > freen_fake_quant; + for (int fi = 0; fi < this->dict_free[idx].size(); fi++) + { + if (this->dict_free[idx][fi] != 0) + { +// printf("---free---\n"); + this->fp32_out[this->dict_free[idx][fi] ].clear(); + this->fake_quant_out[this->dict_free[idx][fi] ].clear(); + } + } + } +} + + +void QuantTool::load_activation_scale(struct graph* graphn, const char* scale_file, int mode_sc) +{ + std::unordered_map layer_scale; + std::unordered_map layer_zeropoint; + bool parse_from_file = false; + if (nullptr != scale_file) + { + std::ifstream scales(scale_file); + std::string line; + while (std::getline(scales, line)) + { + std::string layer_name; + float scale_val = 0.f; + float zero_point = 0.f; + size_t last = 0; + size_t index = line.find_first_of(" ", last); + size_t idx = line.find_last_of(" ", line.size()); + layer_name = line.substr(last, index - last); + // printf("layer_name : %s \n", layer_name.c_str()); + last = index + 1; + scale_val = atof((line.substr(last, line.size() - last)).c_str()); + zero_point = atof((line.substr(idx + 1, line.size())).c_str()); + + layer_scale[layer_name] = scale_val; + layer_zeropoint[layer_name] = zero_point; + // fprintf(stderr, "quant value : %s %f %f \n", layer_name.c_str(), scale_val, zero_point); + } + } + + std::unordered_map layer_used; + for (int i = 0; i < graphn->node_num; i++) + { + 
struct node* noden = graphn->node_list[i]; + for (int j = 0; j < noden->input_num; j++) + { + std::string layern = graphn->tensor_list[noden->input_tensors[j]]->name; + layer_used[layern]++; + } + } + + if (mode_sc == 0) + { + for (int i = 0; i < graphn->tensor_num; i++) + { + struct tensor* t = graphn->tensor_list[i]; + if (t->tensor_type == 1 || t->tensor_type == 3) + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + } + } + } + else + { + std::unordered_map layer_pass; + for (int i = graphn->tensor_num - 1; i >= 0; i--) + { + struct tensor* t = graphn->tensor_list[i]; + if (t->tensor_type == 1 || t->tensor_type == 3) + { + if (layer_pass[t->name] == 0) + { + uint32_t ir_node_idx = t->producer; + struct node* t_node = graphn->node_list[ir_node_idx]; + + auto op_name = t_node->op.type; + + bool poolTrue = false; + bool reluTrue = false; + if (op_name == OP_POOL) + { + struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == OP_RELU) + { + struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + + if (op_name == OP_FLATTEN || op_name == OP_RESHAPE || op_name == OP_SQUEEZE || op_name == OP_CLIP || + poolTrue || reluTrue) + { + struct tensor* t_in_tensor = graphn->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[t->name] != 0) + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + + if (t_in_tensor->tensor_type == 1 || t_in_tensor->tensor_type == 3) + { + this->recursion_pass_through(graphn, t->name, t_in_tensor, layer_used, layer_scale, + layer_zeropoint, layer_pass); + } + } + } + else + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + } + layer_pass[t->name] = 1; + } + } + } + } + + // for (int i = 0; i < graphn->tensor_num; i++) + // { + // struct ir_tensor* t = 
graphn->tensor_list[i]; + // if (t->tensor_type == 1 || t->tensor_type == 3) + // { + // printf(" sz %s %f %d \n",t->name, t->scale, t->zero_point); + // } + // } +} + + +int QuantTool::get_exec_node_message(int exec_node_idx) +{ + /* get node */ + this->node_fp32 = ( struct exec_node* )get_vector_data(this->exec_graph_fp32->exec_node_list, exec_node_idx); + this->node_fake_quant = ( struct exec_node* )get_vector_data(this->exec_graph_fake_quant->exec_node_list, exec_node_idx); + + /* get op type */ + this->op_name = this->node_fp32->ir_node->op.type; + + /* get exec ops */ + this->node_ops_fp32 = this->node_fp32->node_ops; + this->node_ops_fake_quant = this->node_fake_quant->node_ops; + + /* handle the shape changed and dynamic shape case */ + if (this->node_ops_fp32->reshape && this->node_ops_fp32->reshape(this->node_ops_fp32, this->node_fp32, this->exec_graph_fp32) + && this->node_ops_fake_quant->reshape && this->node_ops_fake_quant->reshape(this->node_ops_fake_quant, this->node_fake_quant, this->exec_graph_fake_quant) < 0) + { + TLOG_ERR("failed to reshape node %d, %s\n", node_fp32->ir_node->index, node_fp32->ir_node->name); + return -1; + } + + /* get output tensor */ + this->output_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->output_tensors[0]]; + this->output_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->output_tensors[0]]; + + /* get exec ops */ + this->execidx_elemnum[exec_node_idx] = this->output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[exec_node_idx] = this->output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[exec_node_idx] = this->output_tensor_fp32->name; //exec idx --> output tensor name + + return 0; +} + +void QuantTool::cosin_similarity(std::vector &cosin, std::vector > &in_a,std::vector > &in_b, uint32_t imgs_num, uint32_t output_num, uint32_t output_channel) // cosin dis perchannel +{ +// fprintf(stderr, " 
in_a %f ",in_a[0][0]); +// fprintf(stderr, " in_b %f ",in_b[0][0]); + + std::vector norm_a(output_channel, 0.0); + std::vector norm_b(output_channel, 0.0); + std::vector a_b(output_channel, 0.0); + + int elem_perchannel = int(output_num / output_channel); + + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < output_channel; j++) + { + for (int k = 0; k < elem_perchannel; k++) + { + int elem_idx = j * elem_perchannel + k; + norm_a[j] += in_a[i][elem_idx] * in_a[i][elem_idx]; + norm_b[j] += in_b[i][elem_idx] * in_b[i][elem_idx]; + a_b[j] += in_a[i][elem_idx] * in_b[i][elem_idx]; + + } + } + } + + cosin.resize(output_channel); + for (int j = 0; j < output_channel; j++) + { + double _a_b_ = sqrt(norm_a[j]) * sqrt(norm_b[j]); +// fprintf(stderr, " %lf %f %f \n ", _a_b_, sqrt(norm_a[j]), sqrt(norm_b[j]) ); + if(_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin[j] = a_b[j]; + else + cosin[j] = a_b[j]/_a_b_; + if (cosin[j] < -999999 || cosin[j] > 999999) + cosin[j] = 0; + } +} + +int QuantTool::assess_quant_loss(int gen) +{ + this->init(); + for (int i = 0; i < this->exec_node_num; i++) + { + this->get_exec_node_message(i); + this->check_for_interlearve(); + + this->out_imgs_fp32.resize(this->max_search_img_num); + this->out_imgs_fake_quant.resize(this->max_search_img_num); + if (this->op_name == OP_CONV || this->op_name == OP_FC) + this->weight_bias_requant(gen); + + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, 0); + + /* op run */ + this->node_ops_fp32->run(this->node_ops_fp32, this->node_fp32, this->exec_graph_fp32); + this->node_ops_fake_quant->run(this->node_ops_fake_quant, this->node_fake_quant, this->exec_graph_fake_quant); + this->activation_requant(this->out_imgs_fake_quant[imgi].data(), this->output_tensor_fake_quant->elem_num, 8, 1, this->output_tensor_fake_quant->scale, this->output_tensor_fake_quant->zero_point); + } + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && 
this->max_search_img_num > 1) ) + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], this->weight_tensor_fp32->dims[0]); + else + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) + this->print_cosin(this->cosin.data(), i, this->weight_tensor_fp32->dims[0]); + else + this->print_cosin(this->cosin.data(), i, 1); +// fprintf(stderr, "cosin [%s] : %f\n", execidx_nodename[i].c_str(), cosin); + + this->weight_bias_reset(); + this->free_used_layers(i); + + /* save node output */ + this->fp32_out.push_back(this->out_imgs_fp32); + this->fake_quant_out.push_back(this->out_imgs_fake_quant); + } + + return 0; +} + +void QuantTool::print_cosin(double* cosin, int idx, int output_channel) +{ + float avg_cosin = 0; + float avg_num = 0; + for (int c = 0; c < output_channel; c++) + { + if (cosin[c] != 0) + { + avg_cosin += cosin[c]; + avg_num ++; + } + } + fprintf(stderr, "cosin %3d %4d avg %0.6f ### ", idx, output_channel, avg_cosin/avg_num); + for (int c = 0; c < output_channel; c++) + { + fprintf(stderr, "%0.6f ",cosin[c]); + } + fprintf(stderr, "\n"); +} + +void QuantTool::weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float zoom) +{ + float* scale_list = (float *)weight_tensor->scale_list; + int* zero_point_list = (int *)weight_tensor->zp_list; + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2,bitcount-symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2,bitcount-symmetry) - 1; + fake_quant_min = 0; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + float scale = scale_list[i] * 
zoom; + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i*elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale; + } + } + } + +} +void QuantTool::weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float* zoom) +{ + float* scale_list = (float *)weight_tensor->scale_list; + int* zero_point_list = (int *)weight_tensor->zp_list; + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2,bitcount-symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2,bitcount-symmetry) - 1; + fake_quant_min = 0; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + float scale = 1; + if (zoom[i] > 5) + scale = scale_list[i]; + else + scale = scale_list[i] * zoom[i]; + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i*elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? 
fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale; + } + } + } + +} + + +int QuantTool::quant_search() +{ + this->init(); + for (int i = 0; i < this->exec_node_num; i++) + { + this->get_exec_node_message(i); + this->check_for_interlearve(); + + this->out_imgs_fp32.resize(this->max_search_img_num); + this->out_imgs_fake_quant.resize(this->max_search_img_num); + + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + this->gen_weight_scale(this->weight_tensor_fake_quant, this->weight_data_fake_quant, this->weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0]); + this->gen_weight_scale(this->weight_tensor_fp32, this->weight_data_fp32, this->weight_tensor_fp32->elem_num, 8, 1, weight_tensor_fp32->dims[0]); + + std::vector cosin_save(weight_tensor_fake_quant->dims[0], -1); + std::vector zoom_save(weight_tensor_fake_quant->dims[0], -1); + for (int snum = 0; snum < 201; snum=snum + 20) + { + float zoom = 1.3 / 200 * (snum + 1); +// float zoom = 1.0; + /* weight requant */ + if (snum < 200) + this->weight_requant_search(weight_tensor_fake_quant, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0], zoom); + else + { + this->weight_requant_search(weight_tensor_fake_quant, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0], zoom_save.data()); + float* buf = (float*)sys_malloc(weight_tensor_fake_quant->dims[0] * 4); + memcpy(buf, zoom_save.data(), weight_tensor_fake_quant->dims[0] * 4); +// printf(" scale3 %f \n",weight_tensor_fp32->scale_list[0]); + for ( int bi = 0; bi < weight_tensor_fake_quant->dims[0]; bi++ ) + { + buf[bi] *= weight_tensor_fp32->scale_list[bi]; + } +// printf(" scale4 %f \n",buf[0]); +// weight_tensor_fake_quant->scale_list = buf; + weight_tensor_fp32->scale_list = buf; + weight_tensor_fp32->quant_param_num = weight_tensor_fp32->dims[0]; +// printf(" scale5 %f 
\n",weight_tensor_fp32->scale_list[0]); + } + if (interleave_size_fake != 0) + { + int M = weight_tensor_fake_quant->dims[0]; + int K = weight_tensor_fake_quant->elem_num / weight_tensor_fake_quant->dims[0]; + this->conv_hcl_interleave_pack4_fp32(M, K, weight_data_fake_quant, interleave_buffer_fake_quant); + } + + /* bias requant */ + if (node_fake_quant->ir_node->input_num > 2) + { + struct tensor* input_tensor_fake_quant = graphn_fake_quant->tensor_list[node_fake_quant->ir_node->input_tensors[0]]; + struct tensor* bias_tensor_fake_quant = graphn_fake_quant->tensor_list[node_fake_quant->ir_node->input_tensors[2]]; + struct tensor* bias_tensor_fp32 = graphn_fp32->tensor_list[node_fp32->ir_node->input_tensors[2]]; + + bias_size = bias_tensor_fp32->elem_num * bias_tensor_fp32->elem_size; + + bias_data_fp32 = (float*)bias_tensor_fp32->data; + bias_data_fake_quant = (float*)bias_tensor_fake_quant->data; + + this->bias_requant(input_tensor_fake_quant, weight_tensor_fake_quant, bias_tensor_fake_quant, + bias_data_fake_quant, bias_tensor_fake_quant->elem_num, bias_tensor_fake_quant->dims[0]); + } + + /* per image run */ + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, snum); + + /* FP32 op run */ + if (snum == 0) + { +// set_tensor_buffer(output_tensor_fp32, out_imgs_fp32[imgi].data(), output_tensor_fp32->elem_num * output_tensor_fp32->elem_size); + node_ops_fp32->run(node_ops_fp32, node_fp32, exec_graph_fp32); + + this->execidx_elemnum[i] = output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[i] = output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[i] = output_tensor_fp32->name; + } + + /* fake quant op run */ +// set_tensor_buffer(output_tensor_fake_quant, out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num * output_tensor_fake_quant->elem_size); + node_ops_fake_quant->run(node_ops_fake_quant, node_fake_quant, 
exec_graph_fake_quant); + this->activation_requant(out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num, 8, 1, output_tensor_fake_quant->scale, output_tensor_fake_quant->zero_point); + } // image number + + output_channel = output_tensor_fp32->dims[1]; + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1) ) + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], output_channel); + else + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + +// this->cosin_similarity(this->cosin, out_imgs_fp32, out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], output_channel); + + for (int cosi = 0; cosi < output_channel; cosi++) + { + if (cosin[cosi] > cosin_save[cosi]) + { + cosin_save[cosi] = cosin[cosi]; + zoom_save[cosi] = zoom; + } + } + if (snum == 200) + { + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) + this->print_cosin(this->cosin.data(), i, output_channel); + else + this->print_cosin(this->cosin.data(), i, 1); + } + + if (op_name == OP_CONV || op_name == OP_FC) + { + memcpy(weight_data_fake_quant, weight_data_fp32, weight_size); +// this->weight_correction(weight_data_fp32, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, this->bitcount, this->symmetry, weight_tensor_fake_quant->dims[0]); + memcpy(interleave_buffer_fake_quant, interleave_buffer_fp32, interleave_size_fake); + if (node_fake_quant->ir_node->input_num > 2) + { + memcpy(bias_data_fake_quant, bias_data_fp32, bias_size); + } + } + } + } + else + { + /* per image run */ + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, 0); + +// set_tensor_buffer(output_tensor_fp32, out_imgs_fp32[imgi].data(), output_tensor_fp32->elem_num * 
output_tensor_fp32->elem_size); + node_ops_fp32->run(node_ops_fp32, node_fp32, exec_graph_fp32); + + /* fake quant op run */ +// set_tensor_buffer(output_tensor_fake_quant, out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num * output_tensor_fake_quant->elem_size); + node_ops_fake_quant->run(node_ops_fake_quant, node_fake_quant, exec_graph_fake_quant); + this->activation_requant(out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num, 8, 1, output_tensor_fake_quant->scale, output_tensor_fake_quant->zero_point); + + this->execidx_elemnum[i] = output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[i] = output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[i] = output_tensor_fp32->name; + } + this->cosin_similarity(this->cosin, out_imgs_fp32, out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + this->print_cosin(this->cosin.data(), i, 1); + this->execidx_loss[i] = cosin; + } + + this->free_used_layers(i); + + /* save node output */ + this->fp32_out.push_back(this->out_imgs_fp32); + this->fake_quant_out.push_back(this->out_imgs_fake_quant); + } // node number +// fprintf(stderr, "--------------------------------------\n"); + + if (!save_graph(graphn_fp32, "save_i8_eq.tmfile")) + { + fprintf(stderr, "save graph failed.\n"); + return -1; + } + + return 0; +} + diff --git a/tools/quantize/quant_save_graph.cpp b/tools/quantize/quant_save_graph.cpp index 410db6be3..0ff72f180 100644 --- a/tools/quantize/quant_save_graph.cpp +++ b/tools/quantize/quant_save_graph.cpp @@ -505,6 +505,11 @@ int save_graph_i8_perchannel(const char* model_file, const char* scale_file, con if (internal) { // TODO + for (int ch = 0; ch < channel_num; ch++) + { + weight_scale_list[ch] = weight_tensor->scale_list[ch]; + weight_zp_list[ch] = 0; + } } else { diff --git a/tools/quantize/quant_tool.hpp b/tools/quantize/quant_tool.hpp index cc8d9ae13..c413eaad0 100644 --- 
a/tools/quantize/quant_tool.hpp +++ b/tools/quantize/quant_tool.hpp @@ -22,9 +22,14 @@ * Author: hhchen@openailab.com */ +#pragma once + #include #include #include +#include +#include +#include extern "C" { #include "api/c_api.h" @@ -34,11 +39,41 @@ extern "C" { #include "graph/tensor.h" #include "utility/sys_port.h" #include "utility/utils.h" +#include "utility/log.h" +#include "utility/vector.h" + +#include "../source/device/cpu/cpu_node.h" +#include "../source/device/cpu/cpu_graph.h" + +#include "convolution_param.h" +#include "fc_param.h" +#include "pooling_param.h" +#include "relu_param.h" } +#include "quant_utils.hpp" +#include "quant_save_graph.hpp" + +typedef std::unordered_map dict_str2int; +typedef std::unordered_map dict_str2float; +typedef std::unordered_map dict_uint2uint; +typedef std::unordered_map > dict_uint2vecuint; +typedef std::unordered_map dict_uint2str; +typedef std::unordered_map > dict_uint2doublex; + + #define ALGORITHM_MIN_MAX 0 #define ALGORITHM_KL 1 #define ALGORITHM_ACIQ 2 +#define ALGORITHM_DFQ 3 +#define ALGORITHM_MM_EQ 4 + +struct node_graph +{ + int pass; + std::vector input_node_list; + std::vector output_node_list; +}; class QuantTool { @@ -46,7 +81,41 @@ class QuantTool QuantTool(); ~QuantTool(); + int init(); int activation_quant_tool(); + int assess_quant_loss(int gen); + int quant_search(); + int data_free_quant(); + +private: + void recursion_pass_through(struct graph* graphn, const char* layer_name, struct tensor* t, + dict_str2int &layer_used, dict_str2float &layer_scale, + dict_str2float &layer_zeropoint, dict_str2int &layer_pass); + + struct exec_graph* get_exec_graph(struct graph* graphn); + void load_activation_scale(struct graph* graphn, const char* scale_file, int mode_sc); + int prerun_for_get_ir_tensor(void* graph, struct options opt); + void check_for_free(); + + void check_for_interlearve(); + void weight_bias_requant(int search); + void conv_hcl_interleave_pack4_fp32(int M, int K, float* pA, float* pA_t); + void 
activation_requant(float* data, int elem_num, int bitcount, int symmetry, float scale, int zero_point=0); + void weight_requant(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel); + void weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float zoom); + void weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float* zoom); + void bias_requant(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, + float* data, int elem_num, int elem_channel); + void set_node_input_output_tensor(int idx, int imgi, int snum); + double cosin_similarity(std::vector* in_a,std::vector* in_b, uint32_t imgs_num, uint32_t output_num); + double cosin_similarity(std::vector > &in_a,std::vector > &in_b, uint32_t imgs_num, uint32_t output_num); + void cosin_similarity(std::vector &cosin, std::vector > &in_a,std::vector > &in_b, uint32_t imgs_num, uint32_t output_num, uint32_t output_channel); // cosin dis perchannel + void weight_bias_reset(); + void free_used_layers(int idx); + void gen_weight_scale(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel); + int get_exec_node_message(int exec_node_idx); + + void print_cosin(double* cosin, int idx, int output_channel); public: struct options opt; @@ -61,13 +130,81 @@ class QuantTool int img_c; int img_h; int img_w; - float mean[3]; // value of mean (mean value, default is 104.0,117.0,123.0) - float scale[3]; // value of normalize (scale value, default is 1.0,1.0,1.0) - int center_crop; // flag which indicates that center crop process image is necessary(0:OFF, 1:ON, default is 0) + float mean[3]; // value of mean (mean value, default is 104.0,117.0,123.0) + float scale[3]; // value of normalize (scale value, default is 1.0,1.0,1.0) + int center_crop; // flag which indicates 
that center crop process image is necessary(0:OFF, 1:ON, default is 0) int letterbox_rows; int letterbox_cols; int sw_RGB; // flag which indicates that swap first and last channels in 3-channel image is necessary(0:OFF, 1:ON, default is 1) int focus; // flag which indicates that focus process image is necessary(maybe using for YOLOv5, 0:OFF, 1:ON, default is 0) int inplace; // process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip int algorithm_type; // the type of quant algorithm(0:min-max, 1:kl, default is 0) + bool evaluate; // evaluate quantitative losses + +private: // system variable + dict_uint2uint ir_exec; + dict_uint2uint exec_ir; + dict_uint2vecuint dict_free; + dict_uint2uint execidx_elemnum; + dict_uint2uint execidx_elemsize; + dict_uint2str execidx_nodename; + dict_uint2doublex execidx_loss; + + int max_search_img_num; + + std::vector cosin; + +private: // basic message + int img_size; + double cosin_max; + float scale_acc; + +private: // ir graph variable + std::vector > > fp32_out; + std::vector > > fake_quant_out; + std::vector > input_datas_fp32; + std::vector > input_datas_fake_quant; + std::vector > out_imgs_fp32; + std::vector > out_imgs_fake_quant; + + struct graph* graphn_fp32; + struct graph* graphn_fake_quant; + struct tensor* graph_input_tensor_fp32; + struct tensor* graph_input_tensor_fake_quant; + struct exec_graph* exec_graph_fp32; + struct exec_graph* exec_graph_fake_quant; + int exec_node_num; + +private: // temp variable + uint16_t op_name; + + struct exec_node* node_fp32; + struct exec_node* node_fake_quant; + struct node_ops* node_ops_fp32; + struct node_ops* node_ops_fake_quant; + + struct tensor* input_tensor_fp32; + struct tensor* input_tensor_fake_quant; + struct tensor* weight_tensor_fp32; + struct tensor* weight_tensor_fake_quant; + struct tensor* bias_tensor_fp32; + struct tensor* bias_tensor_fake_quant; + struct tensor* output_tensor_fp32; + struct tensor* 
output_tensor_fake_quant; + + float* weight_data_fp32; + float* weight_data_fake_quant; + uint32_t weight_size; + float* interleave_buffer_fp32; + float* interleave_buffer_fake_quant; + uint32_t interleave_size_fake; + float* bias_data_fp32; + float* bias_data_fake_quant; + uint32_t bias_size; + uint32_t output_channel; + + struct conv_priv_info* conv_priv_info_fp32; + struct conv_priv_info* conv_priv_info_fake_quant; + struct conv_param* conv_param_fp32; + struct conv_param* conv_param_fake_quant; }; diff --git a/tools/quantize/quant_tool_int8.cpp b/tools/quantize/quant_tool_int8.cpp index 4e318d9e4..859840b50 100644 --- a/tools/quantize/quant_tool_int8.cpp +++ b/tools/quantize/quant_tool_int8.cpp @@ -66,6 +66,7 @@ QuantTool::QuantTool() this->focus = 0; this->inplace = true; this->algorithm_type = ALGORITHM_MIN_MAX; + this->evaluate = false; } QuantTool::~QuantTool() @@ -163,6 +164,7 @@ int QuantTool::activation_quant_tool() /* init minmax */ std::unordered_map max_activation; std::unordered_map min_activation; + std::unordered_map act_map; uint32_t act_tensor_num = 0; for (int i = 0; i < ir_graph->tensor_num; i++) { @@ -172,6 +174,7 @@ int QuantTool::activation_quant_tool() act_tensor_num++; max_activation[i] = -FLT_MAX; min_activation[i] = FLT_MAX; + act_map[act_tensor_num-1] = i; } } @@ -213,10 +216,134 @@ int QuantTool::activation_quant_tool() } } } + fprintf(stderr, "\n"); if (this->algorithm_type == ALGORITHM_KL) { - /* todo support */ - fprintf(stderr, "\r\n[****WARNING****]:Step 2 find original calibration kl threshold table NOT support temporarily!\n"); + /* kl process divergence */ + fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table.\n"); + std::unordered_map tensor_hist; + std::unordered_map hist_tensor; + std::vector > hist_edge; + std::vector > hist_gram; + + /* second loop, create histgram */ + for (int nums = imgs_list.size() - 1; nums >= 0; nums--) + { + fprintf(stderr, "\r[Quant Tools Info]: Step 2, images %.5d / %.5d", nums + 
1, img_num); + + get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); + + /* run graph */ + if (run_graph(ir_graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + + /* calculate hist */ + uint32_t inum = 0; + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + float step_max = std::abs(max_activation[i]); + if (std::abs(min_activation[i]) > step_max) + step_max = std::abs(min_activation[i]); + float step_bin = step_max / 2048.0f; + + std::vector every_edge; + if (nums == imgs_list.size() - 1) + { + for (int j = 0; j < 2048; j++) + { + float edge_float = (step_bin * (j + 0.5f)); + every_edge.push_back(edge_float); + } + hist_edge.push_back(every_edge); + hist_gram.push_back(histCount((float*)ir_tensor->data, ir_tensor->elem_num, step_max)); + } + else + { + std::vector hist_tmp; + hist_tmp = histCount((float*)ir_tensor->data, ir_tensor->elem_num, step_max); + for (int j = 0; j < 2048; j++) + { + hist_gram[inum][j] += hist_tmp[j]; + } + } + + tensor_hist[i] = inum; + hist_tensor[inum] = i; + inum++; + } + } + } + + fprintf(stderr, "\n"); + + /* save the calibration file with min-max algorithm with kl divergence */ + int fake_quant_set = 127; + FILE* fp_kl = fopen("table_kl.scale", "wb"); + for (int i = 0; i < act_tensor_num; i++) + { + struct tensor* t = ir_graph->tensor_list[act_map[i]]; + int threshold_bin = threshold_distribution(hist_gram[i], fake_quant_set + 1); + fprintf(stderr, " threshold_bin %d \n", threshold_bin); + + float act_scale = hist_edge[i][threshold_bin] / fake_quant_set; + int act_zero_point = 0; + + /* the scale of softmax always is scale = 1 / 127.f */ + for (int j = 0; j < ir_graph->node_num; j++) + { + struct node* noden = ir_graph->node_list[j]; + struct tensor* 
tensor_tmp = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); + + if (!(tensor_tmp->tensor_type == TENSOR_TYPE_INPUT || tensor_tmp->tensor_type == TENSOR_TYPE_VAR)) + continue; + + std::string tmp_op_name = get_op_name_from_type(noden->op.type); + std::string cur_name = t->name; + std::string tmp_name = tensor_tmp->name; + + if ((cur_name == tmp_name) && tmp_op_name == "Softmax") + { + act_scale = 1 / 127.f; + act_zero_point = 0; + break; + } + } + + /* the scale of eltwise */ + for (int j = 0; j < ir_graph->node_num; j++) + { + struct node* noden = ir_graph->node_list[j]; + std::string tmp_op_name = get_op_name_from_type(noden->op.type); + if (tmp_op_name == "Eltwise") + { + struct tensor* tensor_in0 = get_ir_graph_tensor(ir_graph, noden->input_tensors[0]); + struct tensor* tensor_in1 = get_ir_graph_tensor(ir_graph, noden->input_tensors[1]); + struct tensor* tensor_out = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); + + std::string cur_name = t->name; + std::string tmp_name0 = tensor_in0->name; + std::string tmp_name1 = tensor_in1->name; + + if ((cur_name == tmp_name0 || cur_name == tmp_name1)) + { + act_scale = tensor_out->scale; + break; + } + } + } + + t->scale = act_scale; + t->zero_point = 0; + fprintf(fp_kl, "%s %f %d\n", t->name, act_scale, act_zero_point); + } + fclose(fp_kl); + fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table done, output ./table_kl.scale\n"); } else if (this->algorithm_type == ALGORITHM_ACIQ) { @@ -304,7 +431,7 @@ int QuantTool::activation_quant_tool() fprintf(stderr, "\r\n[Quant Tools Info]: Step 2, find original calibration minmax threshold table done, output ./table_minmax.scale\n"); } - fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num); +// fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / 
img_num); /* release tengine */ postrun_graph(ir_graph); @@ -343,7 +470,7 @@ int main(int argc, char* argv[]) QuantTool quant_tool; int res; - while ((res = getopt(argc, argv, "m:a:f:o:i:g:s:w:b:c:y:k:t:h")) != -1) + while ((res = getopt(argc, argv, "m:a:f:o:i:g:s:w:b:c:y:k:z:t:h")) != -1) { switch (res) { @@ -390,6 +517,9 @@ int main(int argc, char* argv[]) case 'k': quant_tool.focus = atoi(optarg); break; + case 'z': + quant_tool.evaluate = atoi(optarg); + break; case 't': quant_tool.num_thread = atoi(optarg); quant_tool.opt.num_thread = atoi(optarg); @@ -444,35 +574,102 @@ int main(int argc, char* argv[]) fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus ? "ON" : "OFF"); fprintf(stderr, "Thread num : %d\n\n", quant_tool.num_thread); - /* using 3rd calibration table file */ - if (quant_tool.scale_file.empty()) + + switch(quant_tool.algorithm_type) { - /* select algorithm */ - if (quant_tool.algorithm_type == ALGORITHM_MIN_MAX) + case ALGORITHM_MIN_MAX: + { + if (quant_tool.scale_file.empty()) + { + quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); + } + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; + } + case ALGORITHM_KL: + { + if (quant_tool.scale_file.empty()) + { + quant_tool.scale_file = "table_kl.scale"; + quant_tool.activation_quant_tool(); + } + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; + } + case ALGORITHM_ACIQ: { - 
quant_tool.scale_file = "table_minmax.scale"; + if (quant_tool.scale_file.empty()) + { + quant_tool.scale_file = "table_aciq.scale"; + quant_tool.activation_quant_tool(); + } + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; } - else if (quant_tool.algorithm_type == ALGORITHM_KL) + case ALGORITHM_DFQ: { - quant_tool.scale_file = "table_kl.scale"; + quant_tool.data_free_quant(); + quant_tool.model_file = "test_dfq_fp32.tmfile"; + if (quant_tool.scale_file.empty()) + { + quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); + } + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; } - else if (quant_tool.algorithm_type == ALGORITHM_ACIQ) + case ALGORITHM_MM_EQ: { - quant_tool.scale_file = "table_aciq.scale"; + if (quant_tool.scale_file.empty()) + { + quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); + } + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + /* Enable EQ search */ + fprintf(stderr, "[Quant Tools Info]: Step Search, enable EQ search\n"); + quant_tool.quant_search(); + quant_tool.model_file = "save_i8_eq.tmfile"; + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, true); + break; } - else + 
default: { - fprintf(stderr, "[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n"); - quant_tool.scale_file = "table_minmax.scale"; + fprintf(stderr,"Unsupported quantization type ... \n"); + break; } - - /* quantize activation */ - quant_tool.activation_quant_tool(); } - /* quantize weight/bias and save into int8 tmfile */ - fprintf(stderr, "[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str()); - save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\\(^0^)/ ----\n"); diff --git a/tools/quantize/quant_utils.cpp b/tools/quantize/quant_utils.cpp index 0f60d3838..c8265332e 100644 --- a/tools/quantize/quant_utils.cpp +++ b/tools/quantize/quant_utils.cpp @@ -77,7 +77,7 @@ void split(float* array, char* str, const char* del) } void get_input_data_cv(const char* image_file, float* input_data, int img_c, int img_h, int img_w, const float* mean, - const float* scale, int sw_RGB = 0, int center_crop = 0, int letterbox_rows = 0, int letterbox_cols = 0, int focus = 0) + const float* scale, int sw_RGB = 1, int center_crop = 0, int letterbox_rows = 0, int letterbox_cols = 0, int focus = 0) { /* only for yolov5s */ if (focus == 1 && letterbox_rows > 0 && letterbox_cols > 0) @@ -411,6 +411,22 @@ std::vector histCount(float* data, uint32_t elem_num, float max_val, f return hist; } +std::vector histCount(float* data, uint32_t elem_num, float abs_max) +{ + float bin_scale = abs_max / 2047.f; + int bin_zp = 0; + std::vector hist(2048); + for (int i = 0; i < elem_num; i++) + { + if (data[i] != 0) + { + uint32_t hist_idx = round(std::abs(data[i]) / bin_scale); + hist[hist_idx] ++; + } + } + return hist; +} + float compute_kl_divergence(std::vector& dist_a, std::vector& dist_b) { const size_t length = dist_a.size(); diff --git 
a/tools/quantize/quant_utils.hpp b/tools/quantize/quant_utils.hpp index 4ad636763..df529181a 100644 --- a/tools/quantize/quant_utils.hpp +++ b/tools/quantize/quant_utils.hpp @@ -40,6 +40,7 @@ void get_input_data_cv(const char* image_file, float* input_data, int img_c, int void readFileList(std::string basePath, std::vector& imgs); std::vector histCount(float* data, uint32_t elem_num, float max_val, float min_val); +std::vector histCount(float* data, uint32_t elem_num, float abs_max); float compute_kl_divergence(std::vector& dist_a, std::vector& dist_b); From 120adf983dce317d48da5b868de3c437dac41a34 Mon Sep 17 00:00:00 2001 From: BowShotDS Date: Thu, 2 Sep 2021 08:50:50 +0000 Subject: [PATCH 2/2] apply code-format changes --- tools/quantize/algorithm/quant_dfq.cpp | 1148 ++++++----- tools/quantize/algorithm/quant_eq.cpp | 2580 ++++++++++++------------ tools/quantize/quant_tool.hpp | 37 +- tools/quantize/quant_tool_int8.cpp | 168 +- tools/quantize/quant_utils.cpp | 2 +- 5 files changed, 1957 insertions(+), 1978 deletions(-) diff --git a/tools/quantize/algorithm/quant_dfq.cpp b/tools/quantize/algorithm/quant_dfq.cpp index 66be8df81..7b17ddbbf 100644 --- a/tools/quantize/algorithm/quant_dfq.cpp +++ b/tools/quantize/algorithm/quant_dfq.cpp @@ -1,576 +1,572 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: hhchen@openailab.com - */ - -#include "../quant_tool.hpp" - -//int QuantTool::data_free_quant(const char* model_file, const char* image_dir, -// int img_c, int img_h, int img_w, const float* mean, const float* scale, -// int num_thread, int sw_RGB, int center_crop) -int QuantTool::data_free_quant() -{ - int letterbox = 0; - int loop_count = 1; - const char* image_file = nullptr; - - - /* set runtime options */ - struct options opt; - opt.num_thread = num_thread; - opt.cluster = TENGINE_CLUSTER_ALL; - opt.precision = TENGINE_MODE_FP32; - -// /* inital tengine */ -// if (init_tengine() != 0) -// { -// fprintf(stderr, "Initial tengine failed.\n"); -// return -1; -// } -// fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version()); - - /* create graph, load tengine model xxx.tmfile */ - graph_t graph = create_graph(NULL, "tengine", model_file.c_str()); - if (NULL == graph) - { - fprintf(stderr, "Create graph failed.\n"); - fprintf(stderr, "errno: %d \n", get_tengine_errno()); - return -1; - } - - struct graph* graphn = (struct graph*)graph; - struct node_graph* node_proto = ( struct node_graph* )sys_malloc( sizeof(struct node_graph) * graphn->node_num); - - for (int i = 0; i < graphn->node_num; i++) - { - struct node* n = graphn->node_list[i]; //ir node - const uint16_t node_idx = n->index; //node idx - auto op_type = n->op.type; - const char* layer_name = n->name; //layer name - - const uint16_t input_num = n->input_num; //input num - const uint16_t output_num = n->output_num; //output num - - node_proto[i].pass = 0; -// node_proto[i].input_node_list = create_vector(sizeof(uint16_t), NULL); -// node_proto[i].output_node_list = create_vector(sizeof(uint16_t), NULL); - - for (int j = 0; j < input_num; j++) - { - struct tensor* input_tensor = get_ir_graph_tensor(graphn, 
n->input_tensors[j]); - const char* input_tensor_name = input_tensor->name; - uint8_t dim_num = input_tensor->dim_num; - - if (input_tensor->producer >= 0) - { - struct node* node = graphn->node_list[input_tensor->producer]; - node_proto[i].input_node_list.push_back(node->index); - node_proto[node->index].output_node_list.push_back(i); - } - if (OP_CONV == op_type || OP_FC == op_type) - { - break; - } - } - } - - for (int i = 0; i < graphn->node_num; i++) - { - struct node* n = graphn->node_list[i]; //ir node - const uint16_t node_idx = n->index; //node idx - auto op_type = n->op.type; - const char* layer_name = n->name; //layer name - if (op_type != NULL) - { - if (OP_CONV != op_type && OP_FC != op_type) - { - if (node_proto[i].input_node_list.size() == 1 && node_proto[i].output_node_list.size() == 1) - { - uint16_t node_input_id = node_proto[i].input_node_list[0]; - uint16_t node_output_id = node_proto[i].output_node_list[0]; - if (node_proto[node_input_id].output_node_list.size() == 1 && node_proto[node_output_id].input_node_list.size() == 1) - { - node_proto[i].input_node_list.erase(node_proto[i].input_node_list.begin() + 0); - node_proto[i].output_node_list.erase(node_proto[i].output_node_list.begin() + 0); - - node_proto[node_input_id].output_node_list.erase(node_proto[node_input_id].output_node_list.begin() + 0); - node_proto[node_input_id].output_node_list.push_back(node_output_id); - - node_proto[node_output_id].input_node_list.erase(node_proto[node_output_id].input_node_list.begin() + 0); - node_proto[node_output_id].input_node_list.push_back(node_input_id); - } - } - } - } - } - - for (int i = 0; i < graphn->node_num; i++) - { - struct node* n = graphn->node_list[i]; //ir node - const uint16_t node_idx = n->index; //node idx - auto op_name = n->op.type; - const char* layer_name = n->name; //layer name - - const uint16_t input_num = n->input_num; //input num - const uint16_t output_num = n->output_num; //output num - - if (op_name != NULL) - { - if 
(OP_CONV == op_name) - { - // DW_Conv && Direct_Conv - struct conv_param* conv_param = ( struct conv_param* )n->op.param_mem; - if (conv_param->group == conv_param->output_channel) - { -// printf(" #### DW Conv ####\n"); - if (node_proto[i].input_node_list.size() == 1 && node_proto[i].output_node_list.size() == 1) - { - uint16_t node_input_id = node_proto[i].input_node_list[0]; - uint16_t node_output_id = node_proto[i].output_node_list[0]; - auto op_name0 = graphn->node_list[node_input_id]->op.type; - auto op_name2 = graphn->node_list[node_input_id]->op.type; - - if (node_proto[node_input_id].output_node_list.size() == 1 && - node_proto[node_output_id].input_node_list.size() == 1 && - OP_CONV == op_name0 && OP_CONV == op_name2) - { - node_proto[i].pass = 1; //layer1 - node_proto[node_input_id].pass = 1; //layer0 - node_proto[node_output_id].pass = 1; //layer2 - - // layer0 min/max range - struct node* nodeP = graphn->node_list[node_input_id]; - struct tensor* input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); - uint16_t dims0 = input_tensor->dims[0]; - uint16_t dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; - - std::vector layer0_max(dims0, 0.0f); - std::vector layer0_min(dims0, 0.0f); - std::vector layer0_range(dims0, 0.0f); - - float* data_layer0 = (float*)input_tensor->data; - for (int d0 = 0; d0 < dims0; d0++) - { - for (int d1 = 0; d1 < dims123; d1++) - { - if (data_layer0[dims123 * d0 + d1] > layer0_max[d0]) - layer0_max[d0] = data_layer0[dims123 * d0 + d1]; - if (data_layer0[dims123 * d0 + d1] < layer0_max[d0]) - layer0_min[d0] = data_layer0[dims123 * d0 + d1]; - } - } -// printf("### %d ###\n",dims0); - for (int d0 = 0; d0 < dims0; d0++) - { - layer0_range[d0] = layer0_max[d0] - layer0_min[d0]; - } - - // layer1 min/max range - nodeP = graphn->node_list[i]; - input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); - dims0 = input_tensor->dims[0]; - dims123 = input_tensor->dims[1] * 
input_tensor->dims[2] * input_tensor->dims[3]; - - std::vector layer1_max(dims0, 0.0f); - std::vector layer1_min(dims0, 0.0f); - std::vector layer1_range(dims0, 0.0f); - - float* data_layer1 = (float*)input_tensor->data; - for (int d0 = 0; d0 < dims0; d0++) - { - for (int d1 = 0; d1 < dims123; d1++) - { - if (data_layer1[dims123 * d0 + d1] > layer1_max[d0]) - layer1_max[d0] = data_layer1[dims123 * d0 + d1]; - if (data_layer1[dims123 * d0 + d1] < layer1_max[d0]) - layer1_min[d0] = data_layer1[dims123 * d0 + d1]; - } - } -// printf("### %d ###\n",dims0); - for (int d0 = 0; d0 < dims0; d0++) - { - layer1_range[d0] = layer1_max[d0] - layer1_min[d0]; - } - - // layer2 min/max range - nodeP = graphn->node_list[node_output_id]; - input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); - dims0 = input_tensor->dims[0]; - uint16_t dims1 = input_tensor->dims[1]; - uint16_t dims23 = input_tensor->dims[2] * input_tensor->dims[3]; - - std::vector layer2_max(dims0, 0.0f); - std::vector layer2_min(dims0, 0.0f); - std::vector layer2_range(dims0, 0.0f); - - float* data_layer2 = (float*)input_tensor->data; - for (int d0 = 0; d0 < dims0; d0++) - { - for (int d1 = 0; d1 < dims1; d1++) - { - for (int d2 = 0; d2 < dims23; d2++) - { - if (data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] > layer2_max[d1]) - { - layer2_max[d1] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2]; - } - if (data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] < layer2_min[d1]) - { - layer2_min[d1] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2]; - } - } - } - } -// printf("### %d ###\n",dims1); - for (int d1 = 0; d1 < dims1; d1++) - { - layer2_range[d1] = layer2_max[d1] - layer2_min[d1]; - } - -////////////////////////////////////////////////////////////////////////////////// - - // layer ops sqrt - float ops_range[dims1]; - for (int ops = 0; ops < dims1; ops++) - { - ops_range[ops] = pow(layer0_range[ops] * layer1_range[ops] * layer2_range[ops], 1.0/3); - } - - float S01[dims1]; - 
float S01_F[dims1]; - float S12[dims1]; - float S12_F[dims1]; - for (int ops = 0; ops < dims1; ops++) - { - if (ops_range[ops] == 0) - { - S01[ops] = 0.0; - S12_F[ops] = 0.0; - } - else - { - S01[ops] = layer0_range[ops]/ops_range[ops]; - S12_F[ops] = layer2_range[ops]/ops_range[ops]; - } - if (layer0_range[ops] == 0) - S01_F[ops] = 0.0; - else - S01_F[ops] = ops_range[ops]/layer0_range[ops]; - if (layer2_range[ops] == 0) - S12[ops] = 0.0; - else - S12[ops] = ops_range[ops]/layer2_range[ops]; - } -////////////////////////////////////////////////////////////////////////////////// - - // layer0 output - nodeP = graphn->node_list[node_input_id]; - input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); - dims0 = input_tensor->dims[0]; - dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; - for (int d0 = 0; d0 < dims0; d0++) - { - for (int d1 = 0; d1 < dims123; d1++) - { - data_layer0[dims123 * d0 + d1] = data_layer0[dims123 * d0 + d1] * S01_F[d0]; - } - } - input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); - dims0 = input_tensor->dims[0]; - float* data_layer0_bias = (float *)sys_malloc(sizeof(float) * dims0); - data_layer0_bias = (float*)input_tensor->data; - for (int d0 = 0; d0 < dims0; d0++) - { - data_layer0_bias[d0] = data_layer0_bias[d0] * S01_F[d0]; - } - - // layer1 output - nodeP = graphn->node_list[i]; - input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); - dims0 = input_tensor->dims[0]; - dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; - for (int d0 = 0; d0 < dims0; d0++) - { - for (int d1 = 0; d1 < dims123; d1++) - { - data_layer1[dims123 * d0 + d1] = data_layer1[dims123 * d0 + d1] * S01[d0] * S12_F[d0]; - } - } - input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); - dims0 = input_tensor->dims[0]; - float* data_layer1_bias = (float*)input_tensor->data; - for (int d0 = 0; d0 < dims0; d0++) - { - data_layer1_bias[d0] = 
data_layer1_bias[d0] * S12_F[d0]; - } - - // layer2 output - nodeP = graphn->node_list[node_output_id]; - input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); - dims0 = input_tensor->dims[0]; - dims1 = input_tensor->dims[1]; - dims23 = input_tensor->dims[2] * input_tensor->dims[3]; - for (int d0 = 0; d0 < dims0; d0++) - { - for (int d1 = 0; d1 < dims1; d1++) - { - for (int d2 = 0; d2 < dims23; d2++) - { - data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] * S12[d1]; - } - } - } - } - } - } - else - { -// printf(" #### Direct Conv ####\n"); - if (node_proto[i].pass == 0) - { - if (node_proto[i].input_node_list.size() == 1) - { - uint16_t node_input_id = node_proto[i].input_node_list[0]; - if (graphn->node_list[node_input_id]->input_num > 0) - { - auto op_name0 = graphn->node_list[node_input_id]->op.type; - - if (node_proto[node_input_id].output_node_list.size() == 1 && - op_name0 == OP_CONV) - { - node_proto[i].pass = 1; //layer1 - node_proto[node_input_id].pass = 1; //layer0 - - // layer0 min/max range - struct node* nodeP = graphn->node_list[node_input_id]; - struct tensor* input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); - uint16_t dims0 = input_tensor->dims[0]; - uint16_t dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; - - std::vector layer0_max(dims0, 0.0f); - std::vector layer0_min(dims0, 0.0f); - std::vector layer0_range(dims0, 0.0f); - - float* data_layer0 = (float*)input_tensor->data; - for (int d0 = 0; d0 < dims0; d0++) - { - for (int d1 = 0; d1 < dims123; d1++) - { - if (data_layer0[dims123 * d0 + d1] > layer0_max[d0]) - layer0_max[d0] = data_layer0[dims123 * d0 + d1]; - if (data_layer0[dims123 * d0 + d1] < layer0_max[d0]) - layer0_min[d0] = data_layer0[dims123 * d0 + d1]; - } - } -// printf("### %d ###\n",dims0); - for (int d0 = 0; d0 < dims0; d0++) - { - layer0_range[d0] = layer0_max[d0] - layer0_min[d0]; - } - - // layer1 min/max 
range - nodeP = graphn->node_list[i]; - input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); - dims0 = input_tensor->dims[0]; - uint16_t dims1 = input_tensor->dims[1]; - uint16_t dims23 = input_tensor->dims[2] * input_tensor->dims[3]; - - std::vector layer1_max(dims0, 0.0f); - std::vector layer1_min(dims0, 0.0f); - std::vector layer1_range(dims0, 0.0f); - - float* data_layer1 = (float*)input_tensor->data; - for (int d0 = 0; d0 < dims0; d0++) - { - for (int d1 = 0; d1 < dims1; d1++) - { - for (int d2 = 0; d2 < dims23; d2++) - { - if (data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] > layer1_max[d1]) - { - layer1_max[d1] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2]; - } - if (data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] < layer1_min[d1]) - { - layer1_min[d1] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2]; - } - } - } - } -// printf("### %d ###\n",dims1); - for (int d0 = 0; d0 < dims1; d0++) - { - layer1_range[d0] = layer1_max[d0] - layer1_min[d0]; - } - -////////////////////////////////////////////////////////////////////////////////// - - // layer ops sqrt - float ops_range[dims1]; - for (int ops = 0; ops < dims1; ops++) - { - ops_range[ops] = sqrt(layer0_range[ops] * layer1_range[ops]); - } - - float S01[dims1]; - float S01_F[dims1]; - for (int ops = 0; ops < dims1; ops++) - { - if (ops_range[ops] == 0) - { - S01[ops] = 0.0; - } - else - { - S01[ops] = layer0_range[ops]/ops_range[ops]; - } - if (layer0_range[ops] == 0) - S01_F[ops] = 0.0; - else - S01_F[ops] = ops_range[ops]/layer0_range[ops]; - } -////////////////////////////////////////////////////////////////////////////////// - // layer0 output - nodeP = graphn->node_list[node_input_id]; - input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); - dims0 = input_tensor->dims[0]; - dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; - for (int d0 = 0; d0 < dims0; d0++) - { - for (int d1 = 0; d1 < dims123; d1++) - { - 
data_layer0[dims123 * d0 + d1] = data_layer0[dims123 * d0 + d1] * S01_F[d0]; - } - } - input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); - dims0 = input_tensor->dims[0]; - float* data_layer0_bias = (float *)sys_malloc(sizeof(float) * dims0); - data_layer0_bias = (float*)input_tensor->data; - for (int d0 = 0; d0 < dims0; d0++) - { - data_layer0_bias[d0] = data_layer0_bias[d0] * S01_F[d0]; - } - - // layer1 output - nodeP = graphn->node_list[i]; - input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); - dims0 = input_tensor->dims[0]; - dims1 = input_tensor->dims[1]; - dims23 = input_tensor->dims[2] * input_tensor->dims[3]; - for (int d0 = 0; d0 < dims0; d0++) - { - for (int d1 = 0; d1 < dims1; d1++) - { - for (int d2 = 0; d2 < dims23; d2++) - { - data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] * S01[d1]; - } - } - } - } - } - } - } - } - } - } - } - - if (!save_graph(graph, "test_dfq_fp32.tmfile")) - { - fprintf(stderr, "save graph failed.\n"); - return -1; - } - - /* set the shape, data buffer of input_tensor of the graph */ - int img_size = img_h * img_w * img_c; - int dims[] = {1, img_c, img_h, img_w}; // nchw - float* input_data = ( float* )malloc(img_size * sizeof(float)); - - tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); - if (input_tensor == NULL) - { - fprintf(stderr, "Get input tensor failed\n"); - return -1; - } - - if (set_tensor_shape(input_tensor, dims, 4) < 0) - { - fprintf(stderr, "Set input tensor shape failed\n"); - return -1; - } - - if (set_tensor_buffer(input_tensor, input_data, img_size * 4) < 0) - { - fprintf(stderr, "Set input tensor buffer failed\n"); - return -1; - } - - /* prerun graph, set work options(num_thread, cluster, precision) */ - if (prerun_graph_multithread(graph, opt) < 0) - { - fprintf(stderr, "Prerun multithread graph failed.\n"); - return -1; - } - - std::vector imgs_list; - if (image_dir.c_str() != NULL) - { - 
readFileList(image_dir, imgs_list); - } - else - { - imgs_list.push_back(image_file); - } - uint32_t img_num = imgs_list.size(); - - /* prepare process input data, set the data mem to input tensor */ - get_input_data_cv(imgs_list[0].c_str(), input_data, img_c, img_h, img_w, mean, scale, - 1, 0, 0, 0, 0); - - /* run graph */ - for (int i = 0; i < loop_count; i++) - { - double start = get_current_time(); - if (run_graph(graph, 1) < 0) - { - fprintf(stderr, "Run graph failed\n"); - return -1; - } - } - - /* get the result of classification */ - tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); - float* output_data = ( float* )get_tensor_buffer(output_tensor); - int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); - -// printf("out put data %f %d \n",output_data[0], output_size); - fprintf(stderr, "--------------------------------------\n"); - - /* release tengine */ - free(input_data); - postrun_graph(graph); - destroy_graph(graph); -// release_tengine(); - - return 0; -} +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "../quant_tool.hpp" + +//int QuantTool::data_free_quant(const char* model_file, const char* image_dir, +// int img_c, int img_h, int img_w, const float* mean, const float* scale, +// int num_thread, int sw_RGB, int center_crop) +int QuantTool::data_free_quant() +{ + int letterbox = 0; + int loop_count = 1; + const char* image_file = nullptr; + + /* set runtime options */ + struct options opt; + opt.num_thread = num_thread; + opt.cluster = TENGINE_CLUSTER_ALL; + opt.precision = TENGINE_MODE_FP32; + + // /* inital tengine */ + // if (init_tengine() != 0) + // { + // fprintf(stderr, "Initial tengine failed.\n"); + // return -1; + // } + // fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version()); + + /* create graph, load tengine model xxx.tmfile */ + graph_t graph = create_graph(NULL, "tengine", model_file.c_str()); + if (NULL == graph) + { + fprintf(stderr, "Create graph failed.\n"); + fprintf(stderr, "errno: %d \n", get_tengine_errno()); + return -1; + } + + struct graph* graphn = (struct graph*)graph; + struct node_graph* node_proto = (struct node_graph*)sys_malloc(sizeof(struct node_graph) * graphn->node_num); + + for (int i = 0; i < graphn->node_num; i++) + { + struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_type = n->op.type; + const char* layer_name = n->name; //layer name + + const uint16_t input_num = n->input_num; //input num + const uint16_t output_num = n->output_num; //output num + + node_proto[i].pass = 0; + // node_proto[i].input_node_list = create_vector(sizeof(uint16_t), NULL); + // node_proto[i].output_node_list = create_vector(sizeof(uint16_t), NULL); + + for (int j = 0; j < input_num; j++) + { + struct tensor* input_tensor = get_ir_graph_tensor(graphn, n->input_tensors[j]); + const char* input_tensor_name = input_tensor->name; + uint8_t dim_num = input_tensor->dim_num; + + 
if (input_tensor->producer >= 0) + { + struct node* node = graphn->node_list[input_tensor->producer]; + node_proto[i].input_node_list.push_back(node->index); + node_proto[node->index].output_node_list.push_back(i); + } + if (OP_CONV == op_type || OP_FC == op_type) + { + break; + } + } + } + + for (int i = 0; i < graphn->node_num; i++) + { + struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_type = n->op.type; + const char* layer_name = n->name; //layer name + if (op_type != NULL) + { + if (OP_CONV != op_type && OP_FC != op_type) + { + if (node_proto[i].input_node_list.size() == 1 && node_proto[i].output_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + uint16_t node_output_id = node_proto[i].output_node_list[0]; + if (node_proto[node_input_id].output_node_list.size() == 1 && node_proto[node_output_id].input_node_list.size() == 1) + { + node_proto[i].input_node_list.erase(node_proto[i].input_node_list.begin() + 0); + node_proto[i].output_node_list.erase(node_proto[i].output_node_list.begin() + 0); + + node_proto[node_input_id].output_node_list.erase(node_proto[node_input_id].output_node_list.begin() + 0); + node_proto[node_input_id].output_node_list.push_back(node_output_id); + + node_proto[node_output_id].input_node_list.erase(node_proto[node_output_id].input_node_list.begin() + 0); + node_proto[node_output_id].input_node_list.push_back(node_input_id); + } + } + } + } + } + + for (int i = 0; i < graphn->node_num; i++) + { + struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_name = n->op.type; + const char* layer_name = n->name; //layer name + + const uint16_t input_num = n->input_num; //input num + const uint16_t output_num = n->output_num; //output num + + if (op_name != NULL) + { + if (OP_CONV == op_name) + { + // DW_Conv && Direct_Conv + struct conv_param* conv_param = (struct conv_param*)n->op.param_mem; + if 
(conv_param->group == conv_param->output_channel) + { + // printf(" #### DW Conv ####\n"); + if (node_proto[i].input_node_list.size() == 1 && node_proto[i].output_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + uint16_t node_output_id = node_proto[i].output_node_list[0]; + auto op_name0 = graphn->node_list[node_input_id]->op.type; + auto op_name2 = graphn->node_list[node_input_id]->op.type; + + if (node_proto[node_input_id].output_node_list.size() == 1 && node_proto[node_output_id].input_node_list.size() == 1 && OP_CONV == op_name0 && OP_CONV == op_name2) + { + node_proto[i].pass = 1; //layer1 + node_proto[node_input_id].pass = 1; //layer0 + node_proto[node_output_id].pass = 1; //layer2 + + // layer0 min/max range + struct node* nodeP = graphn->node_list[node_input_id]; + struct tensor* input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + uint16_t dims0 = input_tensor->dims[0]; + uint16_t dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer0_max(dims0, 0.0f); + std::vector layer0_min(dims0, 0.0f); + std::vector layer0_range(dims0, 0.0f); + + float* data_layer0 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + if (data_layer0[dims123 * d0 + d1] > layer0_max[d0]) + layer0_max[d0] = data_layer0[dims123 * d0 + d1]; + if (data_layer0[dims123 * d0 + d1] < layer0_max[d0]) + layer0_min[d0] = data_layer0[dims123 * d0 + d1]; + } + } + // printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer0_range[d0] = layer0_max[d0] - layer0_min[d0]; + } + + // layer1 min/max range + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer1_max(dims0, 0.0f); + std::vector layer1_min(dims0, 0.0f); + std::vector 
layer1_range(dims0, 0.0f); + + float* data_layer1 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + if (data_layer1[dims123 * d0 + d1] > layer1_max[d0]) + layer1_max[d0] = data_layer1[dims123 * d0 + d1]; + if (data_layer1[dims123 * d0 + d1] < layer1_max[d0]) + layer1_min[d0] = data_layer1[dims123 * d0 + d1]; + } + } + // printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer1_range[d0] = layer1_max[d0] - layer1_min[d0]; + } + + // layer2 min/max range + nodeP = graphn->node_list[node_output_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + uint16_t dims1 = input_tensor->dims[1]; + uint16_t dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer2_max(dims0, 0.0f); + std::vector layer2_min(dims0, 0.0f); + std::vector layer2_range(dims0, 0.0f); + + float* data_layer2 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + if (data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] > layer2_max[d1]) + { + layer2_max[d1] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + if (data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] < layer2_min[d1]) + { + layer2_min[d1] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + } + } + } + // printf("### %d ###\n",dims1); + for (int d1 = 0; d1 < dims1; d1++) + { + layer2_range[d1] = layer2_max[d1] - layer2_min[d1]; + } + + ////////////////////////////////////////////////////////////////////////////////// + + // layer ops sqrt + float ops_range[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + ops_range[ops] = pow(layer0_range[ops] * layer1_range[ops] * layer2_range[ops], 1.0 / 3); + } + + float S01[dims1]; + float S01_F[dims1]; + float S12[dims1]; + float S12_F[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + if (ops_range[ops] == 0) + { + 
S01[ops] = 0.0; + S12_F[ops] = 0.0; + } + else + { + S01[ops] = layer0_range[ops] / ops_range[ops]; + S12_F[ops] = layer2_range[ops] / ops_range[ops]; + } + if (layer0_range[ops] == 0) + S01_F[ops] = 0.0; + else + S01_F[ops] = ops_range[ops] / layer0_range[ops]; + if (layer2_range[ops] == 0) + S12[ops] = 0.0; + else + S12[ops] = ops_range[ops] / layer2_range[ops]; + } + ////////////////////////////////////////////////////////////////////////////////// + + // layer0 output + nodeP = graphn->node_list[node_input_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer0[dims123 * d0 + d1] = data_layer0[dims123 * d0 + d1] * S01_F[d0]; + } + } + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + dims0 = input_tensor->dims[0]; + float* data_layer0_bias = (float*)sys_malloc(sizeof(float) * dims0); + data_layer0_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer0_bias[d0] = data_layer0_bias[d0] * S01_F[d0]; + } + + // layer1 output + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer1[dims123 * d0 + d1] = data_layer1[dims123 * d0 + d1] * S01[d0] * S12_F[d0]; + } + } + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + dims0 = input_tensor->dims[0]; + float* data_layer1_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer1_bias[d0] = data_layer1_bias[d0] * S12_F[d0]; + } + + // layer2 output + nodeP = graphn->node_list[node_output_id]; + input_tensor = 
get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims1 = input_tensor->dims[1]; + dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] * S12[d1]; + } + } + } + } + } + } + else + { + // printf(" #### Direct Conv ####\n"); + if (node_proto[i].pass == 0) + { + if (node_proto[i].input_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + if (graphn->node_list[node_input_id]->input_num > 0) + { + auto op_name0 = graphn->node_list[node_input_id]->op.type; + + if (node_proto[node_input_id].output_node_list.size() == 1 && op_name0 == OP_CONV) + { + node_proto[i].pass = 1; //layer1 + node_proto[node_input_id].pass = 1; //layer0 + + // layer0 min/max range + struct node* nodeP = graphn->node_list[node_input_id]; + struct tensor* input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + uint16_t dims0 = input_tensor->dims[0]; + uint16_t dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer0_max(dims0, 0.0f); + std::vector layer0_min(dims0, 0.0f); + std::vector layer0_range(dims0, 0.0f); + + float* data_layer0 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + if (data_layer0[dims123 * d0 + d1] > layer0_max[d0]) + layer0_max[d0] = data_layer0[dims123 * d0 + d1]; + if (data_layer0[dims123 * d0 + d1] < layer0_max[d0]) + layer0_min[d0] = data_layer0[dims123 * d0 + d1]; + } + } + // printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer0_range[d0] = layer0_max[d0] - layer0_min[d0]; + } + + // layer1 min/max range + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = 
input_tensor->dims[0]; + uint16_t dims1 = input_tensor->dims[1]; + uint16_t dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer1_max(dims0, 0.0f); + std::vector layer1_min(dims0, 0.0f); + std::vector layer1_range(dims0, 0.0f); + + float* data_layer1 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + if (data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] > layer1_max[d1]) + { + layer1_max[d1] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + if (data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] < layer1_min[d1]) + { + layer1_min[d1] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + } + } + } + // printf("### %d ###\n",dims1); + for (int d0 = 0; d0 < dims1; d0++) + { + layer1_range[d0] = layer1_max[d0] - layer1_min[d0]; + } + + ////////////////////////////////////////////////////////////////////////////////// + + // layer ops sqrt + float ops_range[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + ops_range[ops] = sqrt(layer0_range[ops] * layer1_range[ops]); + } + + float S01[dims1]; + float S01_F[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + if (ops_range[ops] == 0) + { + S01[ops] = 0.0; + } + else + { + S01[ops] = layer0_range[ops] / ops_range[ops]; + } + if (layer0_range[ops] == 0) + S01_F[ops] = 0.0; + else + S01_F[ops] = ops_range[ops] / layer0_range[ops]; + } + ////////////////////////////////////////////////////////////////////////////////// + // layer0 output + nodeP = graphn->node_list[node_input_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer0[dims123 * d0 + d1] = data_layer0[dims123 * d0 + d1] * S01_F[d0]; + } + } + input_tensor = 
get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + dims0 = input_tensor->dims[0]; + float* data_layer0_bias = (float*)sys_malloc(sizeof(float) * dims0); + data_layer0_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer0_bias[d0] = data_layer0_bias[d0] * S01_F[d0]; + } + + // layer1 output + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims1 = input_tensor->dims[1]; + dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] * S01[d1]; + } + } + } + } + } + } + } + } + } + } + } + + if (!save_graph(graph, "test_dfq_fp32.tmfile")) + { + fprintf(stderr, "save graph failed.\n"); + return -1; + } + + /* set the shape, data buffer of input_tensor of the graph */ + int img_size = img_h * img_w * img_c; + int dims[] = {1, img_c, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if (input_tensor == NULL) + { + fprintf(stderr, "Get input tensor failed\n"); + return -1; + } + + if (set_tensor_shape(input_tensor, dims, 4) < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + return -1; + } + + if (set_tensor_buffer(input_tensor, input_data, img_size * 4) < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread(graph, opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + std::vector imgs_list; + if (image_dir.c_str() != NULL) + { + readFileList(image_dir, imgs_list); + } + else + { + imgs_list.push_back(image_file); + } + uint32_t img_num = 
imgs_list.size(); + + /* prepare process input data, set the data mem to input tensor */ + get_input_data_cv(imgs_list[0].c_str(), input_data, img_c, img_h, img_w, mean, scale, + 1, 0, 0, 0, 0); + + /* run graph */ + for (int i = 0; i < loop_count; i++) + { + double start = get_current_time(); + if (run_graph(graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + } + + /* get the result of classification */ + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + float* output_data = (float*)get_tensor_buffer(output_tensor); + int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); + + // printf("out put data %f %d \n",output_data[0], output_size); + fprintf(stderr, "--------------------------------------\n"); + + /* release tengine */ + free(input_data); + postrun_graph(graph); + destroy_graph(graph); + // release_tengine(); + + return 0; +} diff --git a/tools/quantize/algorithm/quant_eq.cpp b/tools/quantize/algorithm/quant_eq.cpp index 9d2b31178..b8e80fe6b 100644 --- a/tools/quantize/algorithm/quant_eq.cpp +++ b/tools/quantize/algorithm/quant_eq.cpp @@ -1,1297 +1,1283 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: hhchen@openailab.com - */ - -#include "../quant_tool.hpp" - -int QuantTool::init() -{ - // ir graph variable - this->fp32_out.clear(); - this->fake_quant_out.clear(); - - /* load fp32 graph and fake quant graph */ - this->graphn_fp32 = ( struct graph* )create_graph(nullptr, "tengine", this->model_file.c_str()); - this->graphn_fake_quant = ( struct graph* )create_graph(nullptr, "tengine", this->model_file.c_str()); - - if (this->graphn_fp32 == nullptr || this->graphn_fake_quant == nullptr) - { - fprintf(stderr, "Create graph failed.\n"); - fprintf(stderr, "errno: %d \n", get_tengine_errno()); - return -1; - } - - /* load activation scale to ir_tensor */ - this->load_activation_scale(this->graphn_fp32, this->scale_file.c_str(), this->inplace); - this->load_activation_scale(this->graphn_fake_quant, this->scale_file.c_str(), this->inplace); - - /* get graph input tensor */ - this->graph_input_tensor_fp32 = ( struct tensor* )get_graph_input_tensor(( void* )this->graphn_fp32, 0, 0); - this->graph_input_tensor_fake_quant = - ( struct tensor* )get_graph_input_tensor(( void* )this->graphn_fake_quant, 0, 0); - if (this->graph_input_tensor_fp32 == nullptr || this->graph_input_tensor_fake_quant == nullptr) - { - fprintf(stderr, "Get input tensor failed\n"); - return -1; - } - - /* generate images list */ - std::vector imgs_list; - if (!this->image_dir.empty()) - readFileList(this->image_dir, imgs_list); - uint32_t img_num = imgs_list.size(); - - this->max_search_img_num = 50; - if (img_num < this->max_search_img_num) - this->max_search_img_num = img_num; - -// fprintf(stderr, "# eq dataset num %d\n", this->max_search_img_num); - - /* set the shape, data buffer of input_tensor of the graph */ - this->img_size = this->img_h * this->img_w * this->img_c; - int dims[] = {1, img_c, img_h, img_w}; // nchw - float* input_data_fp32 = ( float* )malloc(this->img_size * sizeof(float)); - float* input_data_fake_quant = ( float* 
)malloc(this->img_size * sizeof(float)); - - /* prepare process input data, set the data mem to input tensor */ - float scale_graph_input = this->graph_input_tensor_fake_quant->scale; - int zero_point_graph_input = this->graph_input_tensor_fake_quant->zero_point; -// fprintf(stderr, "scale zp %f %d\n", scale_graph_input, zero_point_graph_input); - - this->input_datas_fp32.resize(this->max_search_img_num); - this->input_datas_fake_quant.resize(this->max_search_img_num); - - for (int i = 0; i < this->max_search_img_num; i++) - { - this->input_datas_fp32[i].resize(this->img_size); - this->input_datas_fake_quant[i].resize(this->img_size); - - get_input_data_cv(imgs_list[i].c_str(), this->input_datas_fp32[i].data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); - - - this->input_datas_fake_quant[i] = this->input_datas_fp32[i]; - this->activation_requant(this->input_datas_fake_quant[i].data(), this->img_size, 8, 1, scale_graph_input, - zero_point_graph_input); - } - - /* set graph input shape */ - int ret_fp32 = set_tensor_shape(this->graph_input_tensor_fp32, dims, 4); - int ret_fake_quant = set_tensor_shape(this->graph_input_tensor_fake_quant, dims, 4); - if (ret_fp32 < 0 || ret_fake_quant < 0) - { - fprintf(stderr, "Set input tensor shape failed\n"); - return -1; - } - - /* set graph input buffer */ - ret_fp32 = set_tensor_buffer(this->graph_input_tensor_fp32, input_data_fp32, this->img_size * 4); - ret_fake_quant = set_tensor_buffer(this->graph_input_tensor_fake_quant, input_data_fake_quant, this->img_size * 4); - if (ret_fp32 < 0 || ret_fake_quant < 0) - { - fprintf(stderr, "Set input tensor buffer failed\n"); - return -1; - } - - /* prerun graph, set work options(num_thread, cluster, precision) */ - if (prerun_graph_multithread(( void* )this->graphn_fp32, this->opt) < 0) - { - fprintf(stderr, "Prerun multithread graph failed.\n"); - return -1; - } - ret_fp32 = prerun_graph_multithread((void*)this->graphn_fp32, 
this->opt); - ret_fake_quant = prerun_graph_multithread((void*)this->graphn_fake_quant, this->opt); - if (ret_fp32 < 0 || ret_fake_quant < 0) - { - fprintf(stderr, "Prerun multithread graph failed.\n"); - return -1; - } - - /* get exec graph */ - this->exec_graph_fp32 = this->get_exec_graph(this->graphn_fp32); - this->exec_graph_fake_quant = this->get_exec_graph(this->graphn_fake_quant); - this->exec_node_num = get_vector_num(this->exec_graph_fp32->exec_node_list); - - /* ir idx <<<->>> exec idx */ - for (int i = 0; i < this->exec_node_num; i++) - { - this->node_fp32 = ( struct exec_node* )get_vector_data(this->exec_graph_fp32->exec_node_list, i); - this->node_fake_quant = ( struct exec_node* )get_vector_data(this->exec_graph_fake_quant->exec_node_list, i); - - int out_t = node_fp32->ir_node->output_tensors[0]; - this->ir_exec[graphn_fp32->tensor_list[out_t]->producer] = i; // ir idx --> exec idx - this->exec_ir[i] = graphn_fp32->tensor_list[out_t]->producer; // exec idx --> ir idx -// printf(" %d : %d\n", graphn_fp32->tensor_list[out_t]->producer, i); - } - - /* check for free node*/ - this->check_for_free(); - - return 0; -} - -void QuantTool::activation_requant(float* data, int elem_num, int bitcount, int symmetry, float scale, int zero_point) -{ -// symmetry = 0; - float fake_quant_max; - float fake_quant_min; - - if (symmetry == 1) - { - fake_quant_max = pow(2,bitcount-symmetry) - 1; - fake_quant_min = -fake_quant_max; - } - else - { - fake_quant_max = pow(2,bitcount-symmetry) - 1; - fake_quant_min = 0; - } - - for (int i = 0; i < elem_num; i++) - { - data[i] = round(data[i] / scale) + zero_point; - data[i] = data[i] > fake_quant_max ? fake_quant_max : data[i]; - data[i] = data[i] < fake_quant_min ? 
fake_quant_min : data[i]; - data[i] = (data[i] - zero_point) * scale; - } -} - -void QuantTool::recursion_pass_through(struct graph* graphn, const char* layer_name, struct tensor* t, - dict_str2int &layer_used, dict_str2float &layer_scale, dict_str2float &layer_zeropoint, dict_str2int &layer_pass) -{ - if (layer_pass[t->name] == 0 && layer_used[t->name] < 2) - { - t->scale = layer_scale[layer_name]; - t->zero_point = layer_zeropoint[layer_name]; - layer_scale[t->name] = layer_scale[layer_name]; - layer_zeropoint[t->name] = layer_zeropoint[layer_name]; - - uint32_t ir_node_idx = t->producer; - struct node* t_node = graphn->node_list[ir_node_idx]; - - auto op_name = t_node->op.type; - bool poolTrue = false; - bool reluTrue = false; - if (op_name == OP_POOL) - { - struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem; - if (pool_param->pool_method == 0) - poolTrue = true; - } - else if (op_name == OP_RELU) - { - struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem; - if (relu_param->negative_slope == 0.f) - reluTrue = true; - } - if (op_name == OP_FLATTEN || op_name == OP_RESHAPE || op_name == OP_SQUEEZE || op_name == OP_CLIP || - poolTrue || reluTrue) - { - struct tensor* t_in_tensor = graphn->tensor_list[t_node->input_tensors[0]]; - if (layer_scale[t->name] != 0) - { - if (t_in_tensor->tensor_type == 1 || t_in_tensor->tensor_type == 3) - { - QuantTool::recursion_pass_through(graphn, t->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); - } - } - } - layer_pass[t->name] = 1; - } -} - -struct exec_graph* QuantTool::get_exec_graph(struct graph* graphn) -{ - struct subgraph* subgraph = get_ir_graph_subgraph(graphn, 0); - struct exec_graph* exec_graph = ( struct exec_graph* )subgraph->device_graph; - - return exec_graph; -} - -void QuantTool::check_for_free() -{ - dict_uint2uint nodeA2B; - for (int i = 0; i < this->exec_node_num; i++) - { - this->node_fp32 = ( struct exec_node* 
)get_vector_data(this->exec_graph_fp32->exec_node_list, i); - this->op_name = this->node_fp32->ir_node->op.type; - - for (int j = 0; j < this->node_fp32->ir_node->input_num; j++) - { - struct tensor* t = graphn_fp32->tensor_list[node_fp32->ir_node->input_tensors[j]]; - if (t->tensor_type == 1) - { - uint32_t ir_idx = t->producer; - nodeA2B[this->ir_exec[ir_idx]] = i; - } - } - } - - for (auto iter = nodeA2B.begin(); iter != nodeA2B.end(); iter++) - { - this->dict_free[iter->second].push_back(iter->first); -// printf(" map %d %d\n", iter->first, iter->second); - } -} - -void QuantTool::check_for_interlearve() -{ - if (this->op_name == OP_CONV || this->op_name == OP_FC) - { - /* get weight tensor */ - this->weight_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->input_tensors[1]]; - this->weight_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[1]]; - this->weight_size = this->weight_tensor_fp32->elem_num * this->weight_tensor_fp32->elem_size; - - this->weight_data_fp32 = (float*)this->weight_tensor_fp32->data; - this->weight_data_fake_quant = (float*)this->weight_tensor_fake_quant->data; - - if (this->op_name == OP_CONV) - { - this->conv_param_fp32 = ( struct conv_param* )this->node_fp32->ir_node->op.param_mem; - this->conv_param_fake_quant = ( struct conv_param* )this->node_fake_quant->ir_node->op.param_mem; - - if (this->conv_param_fp32->group != this->conv_param_fp32->output_channel) - { - this->conv_priv_info_fp32 = ( struct conv_priv_info* )this->node_fp32->ops_priv; - this->conv_priv_info_fake_quant = ( struct conv_priv_info* )this->node_fake_quant->ops_priv; - - this->interleave_size_fake = this->conv_priv_info_fp32->interleave_buffer_pack4_size; - - this->interleave_buffer_fp32 = ( float* )this->conv_priv_info_fp32->interleave_buffer_pack4; - this->interleave_buffer_fake_quant = ( float* )this->conv_priv_info_fake_quant->interleave_buffer_pack4; - } - } - else - 
this->interleave_size_fake = 0; - } -} - - -void QuantTool::weight_bias_requant(int search) -{ - /* weight requant */ -// printf("### 1.1 this->weight_tensor_fake_quant->scale %f\n",this->weight_tensor_fake_quant->scale); - if (0 == search) - this->weight_requant(this->weight_tensor_fake_quant, this->weight_data_fake_quant, this->weight_tensor_fake_quant->elem_num, 8, 1, this->weight_tensor_fake_quant->dims[0]); - - if (this->interleave_size_fake != 0) - { - int M = this->weight_tensor_fake_quant->dims[0]; - int K = this->weight_tensor_fake_quant->elem_num / weight_tensor_fake_quant->dims[0]; - this->conv_hcl_interleave_pack4_fp32(M, K, this->weight_data_fake_quant, this->interleave_buffer_fake_quant); - } - - /* bias requant */ - if (this->node_fake_quant->ir_node->input_num > 2) - { - this->input_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[0]]; - this->bias_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[2]]; - this->bias_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->input_tensors[2]]; - this->bias_size = this->bias_tensor_fp32->elem_num * this->bias_tensor_fp32->elem_size; - this->bias_data_fp32 = (float*)this->bias_tensor_fp32->data; - this->bias_data_fake_quant = (float*)this->bias_tensor_fake_quant->data; - this->bias_requant(this->input_tensor_fake_quant, this->weight_tensor_fake_quant, this->bias_tensor_fake_quant, - this->bias_data_fake_quant, this->bias_tensor_fake_quant->elem_num, this->bias_tensor_fake_quant->dims[0]); -// this->bias_tensor_fp32->scale = this->bias_tensor_fake_quant->scale; - } -} - -void QuantTool::set_node_input_output_tensor(int idx, int imgi, int snum) -{ - this->out_imgs_fp32[imgi].resize(this->output_tensor_fp32->elem_num); - this->out_imgs_fake_quant[imgi].resize(this->output_tensor_fp32->elem_num); - - if (idx == 0) - { - set_tensor_buffer(this->graph_input_tensor_fp32, 
this->input_datas_fp32[imgi].data(), this->img_size * 4); - set_tensor_buffer(this->graph_input_tensor_fake_quant, this->input_datas_fake_quant[imgi].data(), this->img_size * 4); - } - else - { - for (int inputi = 0; inputi < this->node_fp32->ir_node->input_num; inputi++) - { - uint32_t ir_input_tensor_idx = this->node_fp32->ir_node->input_tensors[inputi]; - this->input_tensor_fp32 = this->graphn_fp32->tensor_list[ir_input_tensor_idx]; - this->input_tensor_fake_quant = this->graphn_fake_quant->tensor_list[ir_input_tensor_idx]; - - if (this->input_tensor_fp32->tensor_type == 1) - { - uint32_t ir_node_idx = this->input_tensor_fp32->producer; - uint32_t input_size = this->input_tensor_fp32->elem_num * input_tensor_fp32->elem_size; - - uint32_t exec_node_idx = this->ir_exec[ir_node_idx]; - - if (imgi == 0 && snum == 0) - { - float* buf_fp32 = (float*)sys_malloc(32); - float* buf_fake_quant = (float*)sys_malloc(32); - - set_tensor_buffer(this->input_tensor_fp32, buf_fp32, input_size); - set_tensor_buffer(this->input_tensor_fake_quant, buf_fake_quant, input_size); - - set_tensor_buffer(this->input_tensor_fp32, this->fp32_out[exec_node_idx][imgi].data(), input_size); - set_tensor_buffer(this->input_tensor_fake_quant, this->fake_quant_out[exec_node_idx][imgi].data(), input_size); - } - else - { - set_tensor_buffer(this->input_tensor_fp32, this->fp32_out[exec_node_idx][imgi].data(), input_size); - set_tensor_buffer(this->input_tensor_fake_quant, this->fake_quant_out[exec_node_idx][imgi].data(), input_size); - } - } // output tensor - } // node input number - } // node i > 0 - - /* init output buffer */ - set_tensor_buffer(this->output_tensor_fp32, this->out_imgs_fp32[imgi].data(), this->output_tensor_fp32->elem_num * this->output_tensor_fp32->elem_size); - set_tensor_buffer(this->output_tensor_fake_quant, this->out_imgs_fake_quant[imgi].data(), this->output_tensor_fake_quant->elem_num * this->output_tensor_fake_quant->elem_size); -} - -double 
QuantTool::cosin_similarity(std::vector > &in_a,std::vector > &in_b, uint32_t imgs_num, uint32_t output_num) -{ - double norm_a=0; - double norm_b=0; - double a_b=0; - - uint32_t fnum = (output_num >> 4) << 4; - uint32_t rnum = output_num - fnum; - -#if 0 //__AVX__ - - float _sumaa0[8] = {0.f}; - float _sumbb0[8] = {0.f}; - float _sumaabb0[8] = {0.f}; - float _sumaa1[8] = {0.f}; - float _sumbb1[8] = {0.f}; - float _sumaabb1[8] = {0.f}; - - __m256 _suma_o0 = _mm256_set1_ps(0.0); - __m256 _sumb_o0 = _mm256_set1_ps(0.0); - __m256 _sumab_o0 = _mm256_set1_ps(0.0); - __m256 _suma_o1 = _mm256_set1_ps(0.0); - __m256 _sumb_o1 = _mm256_set1_ps(0.0); - __m256 _sumab_o1 = _mm256_set1_ps(0.0); - - for (int i = 0; i < imgs_num; i++) - { - const float* in_a_addr = in_a[i].data(); - const float* in_b_addr = in_b[i].data(); - for (int j = 0; j < fnum; j=j+32) - { - __m256 _in_a0 = _mm256_loadu_ps(in_a_addr+j); - __m256 _in_b0 = _mm256_loadu_ps(in_b_addr+j); - __m256 _in_a1 = _mm256_loadu_ps(in_a_addr+j+8); - __m256 _in_b1 = _mm256_loadu_ps(in_b_addr+j+8); - - _suma_o0 = _mm256_fmadd_ps(_in_a0, _in_a0, _suma_o0); - _sumb_o0 = _mm256_fmadd_ps(_in_b0, _in_b0, _sumb_o0); - _sumab_o0 = _mm256_fmadd_ps(_in_a0, _in_b0, _sumab_o0); - _suma_o1 = _mm256_fmadd_ps(_in_a1, _in_a1, _suma_o1); - _sumb_o1 = _mm256_fmadd_ps(_in_b1, _in_b1, _sumb_o1); - _sumab_o1 = _mm256_fmadd_ps(_in_a1, _in_b1, _sumab_o1); - } - } - _mm256_storeu_ps(_sumaa0, _suma_o0); - _mm256_storeu_ps(_sumbb0, _sumb_o0); - _mm256_storeu_ps(_sumaabb0, _sumab_o0); - _mm256_storeu_ps(_sumaa1, _suma_o1); - _mm256_storeu_ps(_sumbb1, _sumb_o1); - _mm256_storeu_ps(_sumaabb1, _sumab_o1); - - for (int i = 0; i < 8; i++) - { - norm_a += _sumaa0[i] + _sumaa1[i]; - norm_b += _sumbb0[i] + _sumbb1[i]; - a_b += _sumaabb0[i] + _sumaabb1[i]; - - } - -#else // normal -// printf("AAAA DIRECT\n"); - for (int i = 0; i < imgs_num; i++) - { - for (int j = 0; j < fnum; j=j+8) - { - for (int k = 0; k < 8; k=k+1) - { - norm_a += in_a[i][j+k] * 
in_a[i][j+k]; - - norm_b += in_b[i][j+k] * in_b[i][j+k]; - - a_b += in_a[i][j+k] * in_b[i][j+k]; - } - } - } - -#endif // __SSE__ __AVX__ - - for (int j = fnum; j < output_num; j++) - { - for (int i = 0; i < imgs_num; i++) - { - norm_a += in_a[i][j] * in_a[i][j]; - norm_b += in_b[i][j] * in_b[i][j]; - a_b += in_a[i][j] * in_b[i][j]; - } - } - - double cosin=0.0; - double _a_b_ = sqrt(norm_a) * sqrt(norm_b); - if(_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) - cosin = a_b; - else - cosin = a_b/_a_b_; - if (cosin < -999999 || cosin > 999999) - cosin = 0; - return cosin; -} - -double QuantTool::cosin_similarity(std::vector* in_a,std::vector* in_b, uint32_t imgs_num, uint32_t output_num) -{ - uint32_t output_channel = 1; - std::vector norm_a(output_channel, 0.0); - std::vector norm_b(output_channel, 0.0); - std::vector a_b(output_channel, 0.0); - - int elem_perchannel = int(output_num / output_channel); - - for (int i = 0; i < imgs_num; i++) - { - for (int j = 0; j < output_channel; j++) - { - for (int k = 0; k < elem_perchannel; k++) - { - int elem_idx = j * elem_perchannel + k; - norm_a[j] += in_a[i][elem_idx] * in_a[i][elem_idx]; - norm_b[j] += in_b[i][elem_idx] * in_b[i][elem_idx]; - a_b[j] += in_a[i][elem_idx] * in_b[i][elem_idx]; - } - } - } - - double cosin; - for (int j = 0; j < output_channel; j++) - { - double _a_b_ = sqrt(norm_a[j]) * sqrt(norm_b[j]); - if(_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) - cosin = a_b[j]; - else - cosin = a_b[j]/_a_b_; - if (cosin < -999999 || cosin > 999999) - cosin = 0; - } - return cosin; -} - -void QuantTool::weight_requant(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel) -{ - float* scale_list = (float *)sys_malloc(elem_channel * 4); - int* zero_point_list = (int *)sys_malloc(elem_channel * 4); - - int elem_perchannel = elem_num / elem_channel; - - float fake_quant_max; - float fake_quant_min; - - if (symmetry == 1) - { - fake_quant_max = pow(2,bitcount-symmetry) - 1; - 
fake_quant_min = -fake_quant_max; - } - else - { - fake_quant_max = pow(2,bitcount-symmetry) - 1; - fake_quant_min = 0; - } - - float scale = 1; - int zero_point = 0; - for (int c = 0; c < elem_channel; c++) - { - float weight_max = *std::max_element(data + c*elem_perchannel, data + (c+1)*elem_perchannel); - float weight_min = *std::min_element(data + c*elem_perchannel, data + (c+1)*elem_perchannel); - if (symmetry == 1) - { - if (abs(weight_max) > abs(weight_min)) - scale = abs(weight_max)/fake_quant_max; - else - scale = abs(weight_min)/fake_quant_max; - zero_point = 0; - } - else - { - scale = (weight_max - weight_min)/fake_quant_max; - zero_point = int(- weight_min / scale); - } - - scale_list[c] = scale; - zero_point_list[c] = zero_point; - } - - if (weight_tensor->scale_list == NULL) - { -// printf(" EMPTY\n "); - weight_tensor->scale_list = scale_list; - weight_tensor->zp_list = zero_point_list; - } - else - { - scale_list = weight_tensor->scale_list; - zero_point_list = weight_tensor->zp_list; - } - - int data_idx; - for (int i = 0; i < elem_channel; i++) - { - for (int j = 0; j < elem_perchannel; j++) - { - data_idx = i*elem_perchannel + j; - if (scale_list[i] == 0) - data[data_idx] = 0; - else - { - data[data_idx] = round(data[data_idx] / scale_list[i]) + zero_point_list[i]; - data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; - data[data_idx] = data[data_idx] < fake_quant_min ? 
fake_quant_min : data[data_idx]; - data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale_list[i]; - } - } - } -} - -void QuantTool::conv_hcl_interleave_pack4_fp32(int M, int K, float* pA, float* pA_t) -{ - int nn_outch = M >> 3; - int remain_outch_start = nn_outch << 3; - - for (int pp = 0; pp < nn_outch; pp++) - { - int p = pp * 8; - - const float* k0 = pA + (p + 0) * K; - const float* k1 = pA + (p + 1) * K; - const float* k2 = pA + (p + 2) * K; - const float* k3 = pA + (p + 3) * K; - const float* k4 = pA + (p + 4) * K; - const float* k5 = pA + (p + 5) * K; - const float* k6 = pA + (p + 6) * K; - const float* k7 = pA + (p + 7) * K; - - float* ktmp = pA_t + (p / 8) * 8 * K; - - for (int q = 0; q < K; q++) - { - ktmp[0] = k0[0]; - ktmp[1] = k1[0]; - ktmp[2] = k2[0]; - ktmp[3] = k3[0]; - ktmp[4] = k4[0]; - ktmp[5] = k5[0]; - ktmp[6] = k6[0]; - ktmp[7] = k7[0]; - ktmp += 8; - - k0 += 1; - k1 += 1; - k2 += 1; - k3 += 1; - k4 += 1; - k5 += 1; - k6 += 1; - k7 += 1; - } - } - - nn_outch = (M - remain_outch_start) >> 2; - for (int pp = 0; pp < nn_outch; pp++) - { - int p = remain_outch_start + pp * 4; - - const float* k0 = pA + (p + 0) * K; - const float* k1 = pA + (p + 1) * K; - const float* k2 = pA + (p + 2) * K; - const float* k3 = pA + (p + 3) * K; - - float* ktmp = pA_t + (p / 8 + (p % 8) / 4) * 8 * K; - - for (int q = 0; q < K; q++) - { - ktmp[0] = k0[0]; - ktmp[1] = k1[0]; - ktmp[2] = k2[0]; - ktmp[3] = k3[0]; - ktmp += 4; - - k0 += 1; - k1 += 1; - k2 += 1; - k3 += 1; - } - } - - remain_outch_start += nn_outch << 2; - - for (int p = remain_outch_start; p < M; p++) - { - const float* k0 = pA + (p + 0) * K; - - float* ktmp = pA_t + (p / 8 + (p % 8) / 4 + p % 4) * 8 * K; - - for (int q = 0; q < K; q++) - { - ktmp[0] = k0[0]; - ktmp++; - k0++; - } - } -} - -void QuantTool::gen_weight_scale(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel) -{ - float* scale_list = (float *)sys_malloc(elem_channel * 4); - int* 
zero_point_list = (int *)sys_malloc(elem_channel * 4); - - int elem_perchannel = elem_num / elem_channel; - - float fake_quant_max; - float fake_quant_min; - - if (symmetry == 1) - { - fake_quant_max = pow(2,bitcount-symmetry) - 1; - fake_quant_min = -fake_quant_max; - } - else - { - fake_quant_max = pow(2,bitcount-symmetry) - 1; - fake_quant_min = 0; - } - - float scale = 1; - int zero_point = 0; - for (int c = 0; c < elem_channel; c++) - { - float weight_max = *std::max_element(data + c*elem_perchannel, data + (c+1)*elem_perchannel); - float weight_min = *std::min_element(data + c*elem_perchannel, data + (c+1)*elem_perchannel); - if (symmetry == 1) - { - if (abs(weight_max) > abs(weight_min)) - scale = abs(weight_max)/fake_quant_max; - else - scale = abs(weight_min)/fake_quant_max; - zero_point = 0; - } - else - { - scale = (weight_max - weight_min)/fake_quant_max; - zero_point = int(- weight_min / scale); - } - - scale_list[c] = scale; - zero_point_list[c] = zero_point; - } - - weight_tensor->scale_list = scale_list; - weight_tensor->zp_list = zero_point_list; -} - -void QuantTool::bias_requant(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, - float* data, int elem_num, int elem_channel) -{ - int elem_perchannel = elem_num / elem_channel; - float* scale_list = (float *)sys_malloc(elem_channel * 4); - - for (int c = 0; c < elem_channel; c++) - { - float input_scale = input_tensor->scale; - float weight_scale = weight_tensor->scale_list[c]; - float bias_scale = input_scale * weight_scale; - scale_list[c] = bias_scale; - } - - bias_tensor->scale_list = scale_list; - - int data_idx; - for (int i = 0; i < elem_channel; i++) - { - for (int j = 0; j < elem_perchannel; j++) - { - data_idx = i*elem_perchannel + j; - if (scale_list[i] == 0) - { - data[data_idx] = 0; - } - else - { - data[data_idx] = round(data[data_idx] / scale_list[i]); - data[data_idx] = data[data_idx] * scale_list[i]; - } - } - } - -} - - -void 
QuantTool::weight_bias_reset() -{ - if (this->op_name == OP_CONV || this->op_name == OP_FC) - { - std::memcpy(this->weight_data_fake_quant, this->weight_data_fp32, this->weight_size); - std::memcpy(this->interleave_buffer_fake_quant, this->interleave_buffer_fp32, this->interleave_size_fake); - if (this->node_fake_quant->ir_node->input_num > 2) - { - memcpy(this->bias_data_fake_quant, this->bias_data_fp32, this->bias_size); - } - } -} - -void QuantTool::free_used_layers(int idx) -{ -// printf("#### free 0 idx %d\n",idx); - if (this->dict_free[idx].size() > 0) - { -// printf("#### free 1 idx %d\n",idx); - std::vector > freen_fp32; - std::vector > freen_fake_quant; - for (int fi = 0; fi < this->dict_free[idx].size(); fi++) - { - if (this->dict_free[idx][fi] != 0) - { -// printf("---free---\n"); - this->fp32_out[this->dict_free[idx][fi] ].clear(); - this->fake_quant_out[this->dict_free[idx][fi] ].clear(); - } - } - } -} - - -void QuantTool::load_activation_scale(struct graph* graphn, const char* scale_file, int mode_sc) -{ - std::unordered_map layer_scale; - std::unordered_map layer_zeropoint; - bool parse_from_file = false; - if (nullptr != scale_file) - { - std::ifstream scales(scale_file); - std::string line; - while (std::getline(scales, line)) - { - std::string layer_name; - float scale_val = 0.f; - float zero_point = 0.f; - size_t last = 0; - size_t index = line.find_first_of(" ", last); - size_t idx = line.find_last_of(" ", line.size()); - layer_name = line.substr(last, index - last); - // printf("layer_name : %s \n", layer_name.c_str()); - last = index + 1; - scale_val = atof((line.substr(last, line.size() - last)).c_str()); - zero_point = atof((line.substr(idx + 1, line.size())).c_str()); - - layer_scale[layer_name] = scale_val; - layer_zeropoint[layer_name] = zero_point; - // fprintf(stderr, "quant value : %s %f %f \n", layer_name.c_str(), scale_val, zero_point); - } - } - - std::unordered_map layer_used; - for (int i = 0; i < graphn->node_num; i++) - { - 
struct node* noden = graphn->node_list[i]; - for (int j = 0; j < noden->input_num; j++) - { - std::string layern = graphn->tensor_list[noden->input_tensors[j]]->name; - layer_used[layern]++; - } - } - - if (mode_sc == 0) - { - for (int i = 0; i < graphn->tensor_num; i++) - { - struct tensor* t = graphn->tensor_list[i]; - if (t->tensor_type == 1 || t->tensor_type == 3) - { - t->scale = layer_scale[t->name]; - t->zero_point = layer_zeropoint[t->name]; - } - } - } - else - { - std::unordered_map layer_pass; - for (int i = graphn->tensor_num - 1; i >= 0; i--) - { - struct tensor* t = graphn->tensor_list[i]; - if (t->tensor_type == 1 || t->tensor_type == 3) - { - if (layer_pass[t->name] == 0) - { - uint32_t ir_node_idx = t->producer; - struct node* t_node = graphn->node_list[ir_node_idx]; - - auto op_name = t_node->op.type; - - bool poolTrue = false; - bool reluTrue = false; - if (op_name == OP_POOL) - { - struct pool_param* pool_param = ( struct pool_param* )t_node->op.param_mem; - if (pool_param->pool_method == 0) - poolTrue = true; - } - else if (op_name == OP_RELU) - { - struct relu_param* relu_param = ( struct relu_param* )t_node->op.param_mem; - if (relu_param->negative_slope == 0.f) - reluTrue = true; - } - - if (op_name == OP_FLATTEN || op_name == OP_RESHAPE || op_name == OP_SQUEEZE || op_name == OP_CLIP || - poolTrue || reluTrue) - { - struct tensor* t_in_tensor = graphn->tensor_list[t_node->input_tensors[0]]; - if (layer_scale[t->name] != 0) - { - t->scale = layer_scale[t->name]; - t->zero_point = layer_zeropoint[t->name]; - - if (t_in_tensor->tensor_type == 1 || t_in_tensor->tensor_type == 3) - { - this->recursion_pass_through(graphn, t->name, t_in_tensor, layer_used, layer_scale, - layer_zeropoint, layer_pass); - } - } - } - else - { - t->scale = layer_scale[t->name]; - t->zero_point = layer_zeropoint[t->name]; - } - layer_pass[t->name] = 1; - } - } - } - } - - // for (int i = 0; i < graphn->tensor_num; i++) - // { - // struct ir_tensor* t = 
graphn->tensor_list[i]; - // if (t->tensor_type == 1 || t->tensor_type == 3) - // { - // printf(" sz %s %f %d \n",t->name, t->scale, t->zero_point); - // } - // } -} - - -int QuantTool::get_exec_node_message(int exec_node_idx) -{ - /* get node */ - this->node_fp32 = ( struct exec_node* )get_vector_data(this->exec_graph_fp32->exec_node_list, exec_node_idx); - this->node_fake_quant = ( struct exec_node* )get_vector_data(this->exec_graph_fake_quant->exec_node_list, exec_node_idx); - - /* get op type */ - this->op_name = this->node_fp32->ir_node->op.type; - - /* get exec ops */ - this->node_ops_fp32 = this->node_fp32->node_ops; - this->node_ops_fake_quant = this->node_fake_quant->node_ops; - - /* handle the shape changed and dynamic shape case */ - if (this->node_ops_fp32->reshape && this->node_ops_fp32->reshape(this->node_ops_fp32, this->node_fp32, this->exec_graph_fp32) - && this->node_ops_fake_quant->reshape && this->node_ops_fake_quant->reshape(this->node_ops_fake_quant, this->node_fake_quant, this->exec_graph_fake_quant) < 0) - { - TLOG_ERR("failed to reshape node %d, %s\n", node_fp32->ir_node->index, node_fp32->ir_node->name); - return -1; - } - - /* get output tensor */ - this->output_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->output_tensors[0]]; - this->output_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->output_tensors[0]]; - - /* get exec ops */ - this->execidx_elemnum[exec_node_idx] = this->output_tensor_fp32->elem_num; //exec idx --> output elem num - this->execidx_elemsize[exec_node_idx] = this->output_tensor_fp32->elem_size; //exec idx --> output elem size - this->execidx_nodename[exec_node_idx] = this->output_tensor_fp32->name; //exec idx --> output tensor name - - return 0; -} - -void QuantTool::cosin_similarity(std::vector &cosin, std::vector > &in_a,std::vector > &in_b, uint32_t imgs_num, uint32_t output_num, uint32_t output_channel) // cosin dis perchannel -{ -// fprintf(stderr, " 
in_a %f ",in_a[0][0]); -// fprintf(stderr, " in_b %f ",in_b[0][0]); - - std::vector norm_a(output_channel, 0.0); - std::vector norm_b(output_channel, 0.0); - std::vector a_b(output_channel, 0.0); - - int elem_perchannel = int(output_num / output_channel); - - for (int i = 0; i < imgs_num; i++) - { - for (int j = 0; j < output_channel; j++) - { - for (int k = 0; k < elem_perchannel; k++) - { - int elem_idx = j * elem_perchannel + k; - norm_a[j] += in_a[i][elem_idx] * in_a[i][elem_idx]; - norm_b[j] += in_b[i][elem_idx] * in_b[i][elem_idx]; - a_b[j] += in_a[i][elem_idx] * in_b[i][elem_idx]; - - } - } - } - - cosin.resize(output_channel); - for (int j = 0; j < output_channel; j++) - { - double _a_b_ = sqrt(norm_a[j]) * sqrt(norm_b[j]); -// fprintf(stderr, " %lf %f %f \n ", _a_b_, sqrt(norm_a[j]), sqrt(norm_b[j]) ); - if(_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) - cosin[j] = a_b[j]; - else - cosin[j] = a_b[j]/_a_b_; - if (cosin[j] < -999999 || cosin[j] > 999999) - cosin[j] = 0; - } -} - -int QuantTool::assess_quant_loss(int gen) -{ - this->init(); - for (int i = 0; i < this->exec_node_num; i++) - { - this->get_exec_node_message(i); - this->check_for_interlearve(); - - this->out_imgs_fp32.resize(this->max_search_img_num); - this->out_imgs_fake_quant.resize(this->max_search_img_num); - if (this->op_name == OP_CONV || this->op_name == OP_FC) - this->weight_bias_requant(gen); - - for (int imgi = 0; imgi < this->max_search_img_num; imgi++) - { - this->set_node_input_output_tensor(i, imgi, 0); - - /* op run */ - this->node_ops_fp32->run(this->node_ops_fp32, this->node_fp32, this->exec_graph_fp32); - this->node_ops_fake_quant->run(this->node_ops_fake_quant, this->node_fake_quant, this->exec_graph_fake_quant); - this->activation_requant(this->out_imgs_fake_quant[imgi].data(), this->output_tensor_fake_quant->elem_num, 8, 1, this->output_tensor_fake_quant->scale, this->output_tensor_fake_quant->zero_point); - } - - if (this->op_name == OP_CONV || (this->op_name == OP_FC && 
this->max_search_img_num > 1) ) - this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], this->weight_tensor_fp32->dims[0]); - else - this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); - - if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) - this->print_cosin(this->cosin.data(), i, this->weight_tensor_fp32->dims[0]); - else - this->print_cosin(this->cosin.data(), i, 1); -// fprintf(stderr, "cosin [%s] : %f\n", execidx_nodename[i].c_str(), cosin); - - this->weight_bias_reset(); - this->free_used_layers(i); - - /* save node output */ - this->fp32_out.push_back(this->out_imgs_fp32); - this->fake_quant_out.push_back(this->out_imgs_fake_quant); - } - - return 0; -} - -void QuantTool::print_cosin(double* cosin, int idx, int output_channel) -{ - float avg_cosin = 0; - float avg_num = 0; - for (int c = 0; c < output_channel; c++) - { - if (cosin[c] != 0) - { - avg_cosin += cosin[c]; - avg_num ++; - } - } - fprintf(stderr, "cosin %3d %4d avg %0.6f ### ", idx, output_channel, avg_cosin/avg_num); - for (int c = 0; c < output_channel; c++) - { - fprintf(stderr, "%0.6f ",cosin[c]); - } - fprintf(stderr, "\n"); -} - -void QuantTool::weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float zoom) -{ - float* scale_list = (float *)weight_tensor->scale_list; - int* zero_point_list = (int *)weight_tensor->zp_list; - - int elem_perchannel = elem_num / elem_channel; - - float fake_quant_max; - float fake_quant_min; - - if (symmetry == 1) - { - fake_quant_max = pow(2,bitcount-symmetry) - 1; - fake_quant_min = -fake_quant_max; - } - else - { - fake_quant_max = pow(2,bitcount-symmetry) - 1; - fake_quant_min = 0; - } - - int data_idx; - for (int i = 0; i < elem_channel; i++) - { - float scale = scale_list[i] * 
zoom; - for (int j = 0; j < elem_perchannel; j++) - { - data_idx = i*elem_perchannel + j; - if (scale_list[i] == 0) - data[data_idx] = 0; - else - { - data[data_idx] = round(data[data_idx] / scale) + zero_point_list[i]; - data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; - data[data_idx] = data[data_idx] < fake_quant_min ? fake_quant_min : data[data_idx]; - data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale; - } - } - } - -} -void QuantTool::weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float* zoom) -{ - float* scale_list = (float *)weight_tensor->scale_list; - int* zero_point_list = (int *)weight_tensor->zp_list; - - int elem_perchannel = elem_num / elem_channel; - - float fake_quant_max; - float fake_quant_min; - - if (symmetry == 1) - { - fake_quant_max = pow(2,bitcount-symmetry) - 1; - fake_quant_min = -fake_quant_max; - } - else - { - fake_quant_max = pow(2,bitcount-symmetry) - 1; - fake_quant_min = 0; - } - - int data_idx; - for (int i = 0; i < elem_channel; i++) - { - float scale = 1; - if (zoom[i] > 5) - scale = scale_list[i]; - else - scale = scale_list[i] * zoom[i]; - for (int j = 0; j < elem_perchannel; j++) - { - data_idx = i*elem_perchannel + j; - if (scale_list[i] == 0) - data[data_idx] = 0; - else - { - data[data_idx] = round(data[data_idx] / scale) + zero_point_list[i]; - data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; - data[data_idx] = data[data_idx] < fake_quant_min ? 
fake_quant_min : data[data_idx]; - data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale; - } - } - } - -} - - -int QuantTool::quant_search() -{ - this->init(); - for (int i = 0; i < this->exec_node_num; i++) - { - this->get_exec_node_message(i); - this->check_for_interlearve(); - - this->out_imgs_fp32.resize(this->max_search_img_num); - this->out_imgs_fake_quant.resize(this->max_search_img_num); - - if (this->op_name == OP_CONV || this->op_name == OP_FC) - { - this->gen_weight_scale(this->weight_tensor_fake_quant, this->weight_data_fake_quant, this->weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0]); - this->gen_weight_scale(this->weight_tensor_fp32, this->weight_data_fp32, this->weight_tensor_fp32->elem_num, 8, 1, weight_tensor_fp32->dims[0]); - - std::vector cosin_save(weight_tensor_fake_quant->dims[0], -1); - std::vector zoom_save(weight_tensor_fake_quant->dims[0], -1); - for (int snum = 0; snum < 201; snum=snum + 20) - { - float zoom = 1.3 / 200 * (snum + 1); -// float zoom = 1.0; - /* weight requant */ - if (snum < 200) - this->weight_requant_search(weight_tensor_fake_quant, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0], zoom); - else - { - this->weight_requant_search(weight_tensor_fake_quant, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0], zoom_save.data()); - float* buf = (float*)sys_malloc(weight_tensor_fake_quant->dims[0] * 4); - memcpy(buf, zoom_save.data(), weight_tensor_fake_quant->dims[0] * 4); -// printf(" scale3 %f \n",weight_tensor_fp32->scale_list[0]); - for ( int bi = 0; bi < weight_tensor_fake_quant->dims[0]; bi++ ) - { - buf[bi] *= weight_tensor_fp32->scale_list[bi]; - } -// printf(" scale4 %f \n",buf[0]); -// weight_tensor_fake_quant->scale_list = buf; - weight_tensor_fp32->scale_list = buf; - weight_tensor_fp32->quant_param_num = weight_tensor_fp32->dims[0]; -// printf(" scale5 %f 
\n",weight_tensor_fp32->scale_list[0]); - } - if (interleave_size_fake != 0) - { - int M = weight_tensor_fake_quant->dims[0]; - int K = weight_tensor_fake_quant->elem_num / weight_tensor_fake_quant->dims[0]; - this->conv_hcl_interleave_pack4_fp32(M, K, weight_data_fake_quant, interleave_buffer_fake_quant); - } - - /* bias requant */ - if (node_fake_quant->ir_node->input_num > 2) - { - struct tensor* input_tensor_fake_quant = graphn_fake_quant->tensor_list[node_fake_quant->ir_node->input_tensors[0]]; - struct tensor* bias_tensor_fake_quant = graphn_fake_quant->tensor_list[node_fake_quant->ir_node->input_tensors[2]]; - struct tensor* bias_tensor_fp32 = graphn_fp32->tensor_list[node_fp32->ir_node->input_tensors[2]]; - - bias_size = bias_tensor_fp32->elem_num * bias_tensor_fp32->elem_size; - - bias_data_fp32 = (float*)bias_tensor_fp32->data; - bias_data_fake_quant = (float*)bias_tensor_fake_quant->data; - - this->bias_requant(input_tensor_fake_quant, weight_tensor_fake_quant, bias_tensor_fake_quant, - bias_data_fake_quant, bias_tensor_fake_quant->elem_num, bias_tensor_fake_quant->dims[0]); - } - - /* per image run */ - for (int imgi = 0; imgi < this->max_search_img_num; imgi++) - { - this->set_node_input_output_tensor(i, imgi, snum); - - /* FP32 op run */ - if (snum == 0) - { -// set_tensor_buffer(output_tensor_fp32, out_imgs_fp32[imgi].data(), output_tensor_fp32->elem_num * output_tensor_fp32->elem_size); - node_ops_fp32->run(node_ops_fp32, node_fp32, exec_graph_fp32); - - this->execidx_elemnum[i] = output_tensor_fp32->elem_num; //exec idx --> output elem num - this->execidx_elemsize[i] = output_tensor_fp32->elem_size; //exec idx --> output elem size - this->execidx_nodename[i] = output_tensor_fp32->name; - } - - /* fake quant op run */ -// set_tensor_buffer(output_tensor_fake_quant, out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num * output_tensor_fake_quant->elem_size); - node_ops_fake_quant->run(node_ops_fake_quant, node_fake_quant, 
exec_graph_fake_quant); - this->activation_requant(out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num, 8, 1, output_tensor_fake_quant->scale, output_tensor_fake_quant->zero_point); - } // image number - - output_channel = output_tensor_fp32->dims[1]; - - if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1) ) - this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], output_channel); - else - this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); - -// this->cosin_similarity(this->cosin, out_imgs_fp32, out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], output_channel); - - for (int cosi = 0; cosi < output_channel; cosi++) - { - if (cosin[cosi] > cosin_save[cosi]) - { - cosin_save[cosi] = cosin[cosi]; - zoom_save[cosi] = zoom; - } - } - if (snum == 200) - { - if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) - this->print_cosin(this->cosin.data(), i, output_channel); - else - this->print_cosin(this->cosin.data(), i, 1); - } - - if (op_name == OP_CONV || op_name == OP_FC) - { - memcpy(weight_data_fake_quant, weight_data_fp32, weight_size); -// this->weight_correction(weight_data_fp32, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, this->bitcount, this->symmetry, weight_tensor_fake_quant->dims[0]); - memcpy(interleave_buffer_fake_quant, interleave_buffer_fp32, interleave_size_fake); - if (node_fake_quant->ir_node->input_num > 2) - { - memcpy(bias_data_fake_quant, bias_data_fp32, bias_size); - } - } - } - } - else - { - /* per image run */ - for (int imgi = 0; imgi < this->max_search_img_num; imgi++) - { - this->set_node_input_output_tensor(i, imgi, 0); - -// set_tensor_buffer(output_tensor_fp32, out_imgs_fp32[imgi].data(), output_tensor_fp32->elem_num * 
output_tensor_fp32->elem_size); - node_ops_fp32->run(node_ops_fp32, node_fp32, exec_graph_fp32); - - /* fake quant op run */ -// set_tensor_buffer(output_tensor_fake_quant, out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num * output_tensor_fake_quant->elem_size); - node_ops_fake_quant->run(node_ops_fake_quant, node_fake_quant, exec_graph_fake_quant); - this->activation_requant(out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num, 8, 1, output_tensor_fake_quant->scale, output_tensor_fake_quant->zero_point); - - this->execidx_elemnum[i] = output_tensor_fp32->elem_num; //exec idx --> output elem num - this->execidx_elemsize[i] = output_tensor_fp32->elem_size; //exec idx --> output elem size - this->execidx_nodename[i] = output_tensor_fp32->name; - } - this->cosin_similarity(this->cosin, out_imgs_fp32, out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); - this->print_cosin(this->cosin.data(), i, 1); - this->execidx_loss[i] = cosin; - } - - this->free_used_layers(i); - - /* save node output */ - this->fp32_out.push_back(this->out_imgs_fp32); - this->fake_quant_out.push_back(this->out_imgs_fake_quant); - } // node number -// fprintf(stderr, "--------------------------------------\n"); - - if (!save_graph(graphn_fp32, "save_i8_eq.tmfile")) - { - fprintf(stderr, "save graph failed.\n"); - return -1; - } - - return 0; -} - +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "../quant_tool.hpp" + +int QuantTool::init() +{ + // ir graph variable + this->fp32_out.clear(); + this->fake_quant_out.clear(); + + /* load fp32 graph and fake quant graph */ + this->graphn_fp32 = (struct graph*)create_graph(nullptr, "tengine", this->model_file.c_str()); + this->graphn_fake_quant = (struct graph*)create_graph(nullptr, "tengine", this->model_file.c_str()); + + if (this->graphn_fp32 == nullptr || this->graphn_fake_quant == nullptr) + { + fprintf(stderr, "Create graph failed.\n"); + fprintf(stderr, "errno: %d \n", get_tengine_errno()); + return -1; + } + + /* load activation scale to ir_tensor */ + this->load_activation_scale(this->graphn_fp32, this->scale_file.c_str(), this->inplace); + this->load_activation_scale(this->graphn_fake_quant, this->scale_file.c_str(), this->inplace); + + /* get graph input tensor */ + this->graph_input_tensor_fp32 = (struct tensor*)get_graph_input_tensor((void*)this->graphn_fp32, 0, 0); + this->graph_input_tensor_fake_quant = (struct tensor*)get_graph_input_tensor((void*)this->graphn_fake_quant, 0, 0); + if (this->graph_input_tensor_fp32 == nullptr || this->graph_input_tensor_fake_quant == nullptr) + { + fprintf(stderr, "Get input tensor failed\n"); + return -1; + } + + /* generate images list */ + std::vector imgs_list; + if (!this->image_dir.empty()) + readFileList(this->image_dir, imgs_list); + uint32_t img_num = imgs_list.size(); + + this->max_search_img_num = 50; + if (img_num < this->max_search_img_num) 
+ this->max_search_img_num = img_num; + + // fprintf(stderr, "# eq dataset num %d\n", this->max_search_img_num); + + /* set the shape, data buffer of input_tensor of the graph */ + this->img_size = this->img_h * this->img_w * this->img_c; + int dims[] = {1, img_c, img_h, img_w}; // nchw + float* input_data_fp32 = (float*)malloc(this->img_size * sizeof(float)); + float* input_data_fake_quant = (float*)malloc(this->img_size * sizeof(float)); + + /* prepare process input data, set the data mem to input tensor */ + float scale_graph_input = this->graph_input_tensor_fake_quant->scale; + int zero_point_graph_input = this->graph_input_tensor_fake_quant->zero_point; + // fprintf(stderr, "scale zp %f %d\n", scale_graph_input, zero_point_graph_input); + + this->input_datas_fp32.resize(this->max_search_img_num); + this->input_datas_fake_quant.resize(this->max_search_img_num); + + for (int i = 0; i < this->max_search_img_num; i++) + { + this->input_datas_fp32[i].resize(this->img_size); + this->input_datas_fake_quant[i].resize(this->img_size); + + get_input_data_cv(imgs_list[i].c_str(), this->input_datas_fp32[i].data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); + + this->input_datas_fake_quant[i] = this->input_datas_fp32[i]; + this->activation_requant(this->input_datas_fake_quant[i].data(), this->img_size, 8, 1, scale_graph_input, + zero_point_graph_input); + } + + /* set graph input shape */ + int ret_fp32 = set_tensor_shape(this->graph_input_tensor_fp32, dims, 4); + int ret_fake_quant = set_tensor_shape(this->graph_input_tensor_fake_quant, dims, 4); + if (ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + return -1; + } + + /* set graph input buffer */ + ret_fp32 = set_tensor_buffer(this->graph_input_tensor_fp32, input_data_fp32, this->img_size * 4); + ret_fake_quant = set_tensor_buffer(this->graph_input_tensor_fake_quant, input_data_fake_quant, this->img_size * 4); + if 
(ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread((void*)this->graphn_fp32, this->opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + ret_fp32 = prerun_graph_multithread((void*)this->graphn_fp32, this->opt); + ret_fake_quant = prerun_graph_multithread((void*)this->graphn_fake_quant, this->opt); + if (ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + /* get exec graph */ + this->exec_graph_fp32 = this->get_exec_graph(this->graphn_fp32); + this->exec_graph_fake_quant = this->get_exec_graph(this->graphn_fake_quant); + this->exec_node_num = get_vector_num(this->exec_graph_fp32->exec_node_list); + + /* ir idx <<<->>> exec idx */ + for (int i = 0; i < this->exec_node_num; i++) + { + this->node_fp32 = (struct exec_node*)get_vector_data(this->exec_graph_fp32->exec_node_list, i); + this->node_fake_quant = (struct exec_node*)get_vector_data(this->exec_graph_fake_quant->exec_node_list, i); + + int out_t = node_fp32->ir_node->output_tensors[0]; + this->ir_exec[graphn_fp32->tensor_list[out_t]->producer] = i; // ir idx --> exec idx + this->exec_ir[i] = graphn_fp32->tensor_list[out_t]->producer; // exec idx --> ir idx + // printf(" %d : %d\n", graphn_fp32->tensor_list[out_t]->producer, i); + } + + /* check for free node*/ + this->check_for_free(); + + return 0; +} + +void QuantTool::activation_requant(float* data, int elem_num, int bitcount, int symmetry, float scale, int zero_point) +{ + // symmetry = 0; + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + for (int i = 0; i < elem_num; i++) + { + data[i] = 
round(data[i] / scale) + zero_point; + data[i] = data[i] > fake_quant_max ? fake_quant_max : data[i]; + data[i] = data[i] < fake_quant_min ? fake_quant_min : data[i]; + data[i] = (data[i] - zero_point) * scale; + } +} + +void QuantTool::recursion_pass_through(struct graph* graphn, const char* layer_name, struct tensor* t, + dict_str2int& layer_used, dict_str2float& layer_scale, dict_str2float& layer_zeropoint, dict_str2int& layer_pass) +{ + if (layer_pass[t->name] == 0 && layer_used[t->name] < 2) + { + t->scale = layer_scale[layer_name]; + t->zero_point = layer_zeropoint[layer_name]; + layer_scale[t->name] = layer_scale[layer_name]; + layer_zeropoint[t->name] = layer_zeropoint[layer_name]; + + uint32_t ir_node_idx = t->producer; + struct node* t_node = graphn->node_list[ir_node_idx]; + + auto op_name = t_node->op.type; + bool poolTrue = false; + bool reluTrue = false; + if (op_name == OP_POOL) + { + struct pool_param* pool_param = (struct pool_param*)t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == OP_RELU) + { + struct relu_param* relu_param = (struct relu_param*)t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + if (op_name == OP_FLATTEN || op_name == OP_RESHAPE || op_name == OP_SQUEEZE || op_name == OP_CLIP || poolTrue || reluTrue) + { + struct tensor* t_in_tensor = graphn->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[t->name] != 0) + { + if (t_in_tensor->tensor_type == 1 || t_in_tensor->tensor_type == 3) + { + QuantTool::recursion_pass_through(graphn, t->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); + } + } + } + layer_pass[t->name] = 1; + } +} + +struct exec_graph* QuantTool::get_exec_graph(struct graph* graphn) +{ + struct subgraph* subgraph = get_ir_graph_subgraph(graphn, 0); + struct exec_graph* exec_graph = (struct exec_graph*)subgraph->device_graph; + + return exec_graph; +} + +void QuantTool::check_for_free() +{ + 
dict_uint2uint nodeA2B; + for (int i = 0; i < this->exec_node_num; i++) + { + this->node_fp32 = (struct exec_node*)get_vector_data(this->exec_graph_fp32->exec_node_list, i); + this->op_name = this->node_fp32->ir_node->op.type; + + for (int j = 0; j < this->node_fp32->ir_node->input_num; j++) + { + struct tensor* t = graphn_fp32->tensor_list[node_fp32->ir_node->input_tensors[j]]; + if (t->tensor_type == 1) + { + uint32_t ir_idx = t->producer; + nodeA2B[this->ir_exec[ir_idx]] = i; + } + } + } + + for (auto iter = nodeA2B.begin(); iter != nodeA2B.end(); iter++) + { + this->dict_free[iter->second].push_back(iter->first); + // printf(" map %d %d\n", iter->first, iter->second); + } +} + +void QuantTool::check_for_interlearve() +{ + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + /* get weight tensor */ + this->weight_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->input_tensors[1]]; + this->weight_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[1]]; + this->weight_size = this->weight_tensor_fp32->elem_num * this->weight_tensor_fp32->elem_size; + + this->weight_data_fp32 = (float*)this->weight_tensor_fp32->data; + this->weight_data_fake_quant = (float*)this->weight_tensor_fake_quant->data; + + if (this->op_name == OP_CONV) + { + this->conv_param_fp32 = (struct conv_param*)this->node_fp32->ir_node->op.param_mem; + this->conv_param_fake_quant = (struct conv_param*)this->node_fake_quant->ir_node->op.param_mem; + + if (this->conv_param_fp32->group != this->conv_param_fp32->output_channel) + { + this->conv_priv_info_fp32 = (struct conv_priv_info*)this->node_fp32->ops_priv; + this->conv_priv_info_fake_quant = (struct conv_priv_info*)this->node_fake_quant->ops_priv; + + this->interleave_size_fake = this->conv_priv_info_fp32->interleave_buffer_pack4_size; + + this->interleave_buffer_fp32 = (float*)this->conv_priv_info_fp32->interleave_buffer_pack4; + this->interleave_buffer_fake_quant = 
(float*)this->conv_priv_info_fake_quant->interleave_buffer_pack4; + } + } + else + this->interleave_size_fake = 0; + } +} + +void QuantTool::weight_bias_requant(int search) +{ + /* weight requant */ + // printf("### 1.1 this->weight_tensor_fake_quant->scale %f\n",this->weight_tensor_fake_quant->scale); + if (0 == search) + this->weight_requant(this->weight_tensor_fake_quant, this->weight_data_fake_quant, this->weight_tensor_fake_quant->elem_num, 8, 1, this->weight_tensor_fake_quant->dims[0]); + + if (this->interleave_size_fake != 0) + { + int M = this->weight_tensor_fake_quant->dims[0]; + int K = this->weight_tensor_fake_quant->elem_num / weight_tensor_fake_quant->dims[0]; + this->conv_hcl_interleave_pack4_fp32(M, K, this->weight_data_fake_quant, this->interleave_buffer_fake_quant); + } + + /* bias requant */ + if (this->node_fake_quant->ir_node->input_num > 2) + { + this->input_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[0]]; + this->bias_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[2]]; + this->bias_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->input_tensors[2]]; + this->bias_size = this->bias_tensor_fp32->elem_num * this->bias_tensor_fp32->elem_size; + this->bias_data_fp32 = (float*)this->bias_tensor_fp32->data; + this->bias_data_fake_quant = (float*)this->bias_tensor_fake_quant->data; + this->bias_requant(this->input_tensor_fake_quant, this->weight_tensor_fake_quant, this->bias_tensor_fake_quant, + this->bias_data_fake_quant, this->bias_tensor_fake_quant->elem_num, this->bias_tensor_fake_quant->dims[0]); + // this->bias_tensor_fp32->scale = this->bias_tensor_fake_quant->scale; + } +} + +void QuantTool::set_node_input_output_tensor(int idx, int imgi, int snum) +{ + this->out_imgs_fp32[imgi].resize(this->output_tensor_fp32->elem_num); + this->out_imgs_fake_quant[imgi].resize(this->output_tensor_fp32->elem_num); + + if (idx 
== 0) + { + set_tensor_buffer(this->graph_input_tensor_fp32, this->input_datas_fp32[imgi].data(), this->img_size * 4); + set_tensor_buffer(this->graph_input_tensor_fake_quant, this->input_datas_fake_quant[imgi].data(), this->img_size * 4); + } + else + { + for (int inputi = 0; inputi < this->node_fp32->ir_node->input_num; inputi++) + { + uint32_t ir_input_tensor_idx = this->node_fp32->ir_node->input_tensors[inputi]; + this->input_tensor_fp32 = this->graphn_fp32->tensor_list[ir_input_tensor_idx]; + this->input_tensor_fake_quant = this->graphn_fake_quant->tensor_list[ir_input_tensor_idx]; + + if (this->input_tensor_fp32->tensor_type == 1) + { + uint32_t ir_node_idx = this->input_tensor_fp32->producer; + uint32_t input_size = this->input_tensor_fp32->elem_num * input_tensor_fp32->elem_size; + + uint32_t exec_node_idx = this->ir_exec[ir_node_idx]; + + if (imgi == 0 && snum == 0) + { + float* buf_fp32 = (float*)sys_malloc(32); + float* buf_fake_quant = (float*)sys_malloc(32); + + set_tensor_buffer(this->input_tensor_fp32, buf_fp32, input_size); + set_tensor_buffer(this->input_tensor_fake_quant, buf_fake_quant, input_size); + + set_tensor_buffer(this->input_tensor_fp32, this->fp32_out[exec_node_idx][imgi].data(), input_size); + set_tensor_buffer(this->input_tensor_fake_quant, this->fake_quant_out[exec_node_idx][imgi].data(), input_size); + } + else + { + set_tensor_buffer(this->input_tensor_fp32, this->fp32_out[exec_node_idx][imgi].data(), input_size); + set_tensor_buffer(this->input_tensor_fake_quant, this->fake_quant_out[exec_node_idx][imgi].data(), input_size); + } + } // output tensor + } // node input number + } // node i > 0 + + /* init output buffer */ + set_tensor_buffer(this->output_tensor_fp32, this->out_imgs_fp32[imgi].data(), this->output_tensor_fp32->elem_num * this->output_tensor_fp32->elem_size); + set_tensor_buffer(this->output_tensor_fake_quant, this->out_imgs_fake_quant[imgi].data(), this->output_tensor_fake_quant->elem_num * 
this->output_tensor_fake_quant->elem_size); +} + +double QuantTool::cosin_similarity(std::vector >& in_a, std::vector >& in_b, uint32_t imgs_num, uint32_t output_num) +{ + double norm_a = 0; + double norm_b = 0; + double a_b = 0; + + uint32_t fnum = (output_num >> 4) << 4; + uint32_t rnum = output_num - fnum; + +#if 0 //__AVX__ + + float _sumaa0[8] = {0.f}; + float _sumbb0[8] = {0.f}; + float _sumaabb0[8] = {0.f}; + float _sumaa1[8] = {0.f}; + float _sumbb1[8] = {0.f}; + float _sumaabb1[8] = {0.f}; + + __m256 _suma_o0 = _mm256_set1_ps(0.0); + __m256 _sumb_o0 = _mm256_set1_ps(0.0); + __m256 _sumab_o0 = _mm256_set1_ps(0.0); + __m256 _suma_o1 = _mm256_set1_ps(0.0); + __m256 _sumb_o1 = _mm256_set1_ps(0.0); + __m256 _sumab_o1 = _mm256_set1_ps(0.0); + + for (int i = 0; i < imgs_num; i++) + { + const float* in_a_addr = in_a[i].data(); + const float* in_b_addr = in_b[i].data(); + for (int j = 0; j < fnum; j=j+32) + { + __m256 _in_a0 = _mm256_loadu_ps(in_a_addr+j); + __m256 _in_b0 = _mm256_loadu_ps(in_b_addr+j); + __m256 _in_a1 = _mm256_loadu_ps(in_a_addr+j+8); + __m256 _in_b1 = _mm256_loadu_ps(in_b_addr+j+8); + + _suma_o0 = _mm256_fmadd_ps(_in_a0, _in_a0, _suma_o0); + _sumb_o0 = _mm256_fmadd_ps(_in_b0, _in_b0, _sumb_o0); + _sumab_o0 = _mm256_fmadd_ps(_in_a0, _in_b0, _sumab_o0); + _suma_o1 = _mm256_fmadd_ps(_in_a1, _in_a1, _suma_o1); + _sumb_o1 = _mm256_fmadd_ps(_in_b1, _in_b1, _sumb_o1); + _sumab_o1 = _mm256_fmadd_ps(_in_a1, _in_b1, _sumab_o1); + } + } + _mm256_storeu_ps(_sumaa0, _suma_o0); + _mm256_storeu_ps(_sumbb0, _sumb_o0); + _mm256_storeu_ps(_sumaabb0, _sumab_o0); + _mm256_storeu_ps(_sumaa1, _suma_o1); + _mm256_storeu_ps(_sumbb1, _sumb_o1); + _mm256_storeu_ps(_sumaabb1, _sumab_o1); + + for (int i = 0; i < 8; i++) + { + norm_a += _sumaa0[i] + _sumaa1[i]; + norm_b += _sumbb0[i] + _sumbb1[i]; + a_b += _sumaabb0[i] + _sumaabb1[i]; + + } + +#else // normal + // printf("AAAA DIRECT\n"); + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < fnum; j = j + 8) + { + 
for (int k = 0; k < 8; k = k + 1) + { + norm_a += in_a[i][j + k] * in_a[i][j + k]; + + norm_b += in_b[i][j + k] * in_b[i][j + k]; + + a_b += in_a[i][j + k] * in_b[i][j + k]; + } + } + } + +#endif // __SSE__ __AVX__ + + for (int j = fnum; j < output_num; j++) + { + for (int i = 0; i < imgs_num; i++) + { + norm_a += in_a[i][j] * in_a[i][j]; + norm_b += in_b[i][j] * in_b[i][j]; + a_b += in_a[i][j] * in_b[i][j]; + } + } + + double cosin = 0.0; + double _a_b_ = sqrt(norm_a) * sqrt(norm_b); + if (_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin = a_b; + else + cosin = a_b / _a_b_; + if (cosin < -999999 || cosin > 999999) + cosin = 0; + return cosin; +} + +double QuantTool::cosin_similarity(std::vector* in_a, std::vector* in_b, uint32_t imgs_num, uint32_t output_num) +{ + uint32_t output_channel = 1; + std::vector norm_a(output_channel, 0.0); + std::vector norm_b(output_channel, 0.0); + std::vector a_b(output_channel, 0.0); + + int elem_perchannel = int(output_num / output_channel); + + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < output_channel; j++) + { + for (int k = 0; k < elem_perchannel; k++) + { + int elem_idx = j * elem_perchannel + k; + norm_a[j] += in_a[i][elem_idx] * in_a[i][elem_idx]; + norm_b[j] += in_b[i][elem_idx] * in_b[i][elem_idx]; + a_b[j] += in_a[i][elem_idx] * in_b[i][elem_idx]; + } + } + } + + double cosin; + for (int j = 0; j < output_channel; j++) + { + double _a_b_ = sqrt(norm_a[j]) * sqrt(norm_b[j]); + if (_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin = a_b[j]; + else + cosin = a_b[j] / _a_b_; + if (cosin < -999999 || cosin > 999999) + cosin = 0; + } + return cosin; +} + +void QuantTool::weight_requant(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel) +{ + float* scale_list = (float*)sys_malloc(elem_channel * 4); + int* zero_point_list = (int*)sys_malloc(elem_channel * 4); + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; 
+ + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + float scale = 1; + int zero_point = 0; + for (int c = 0; c < elem_channel; c++) + { + float weight_max = *std::max_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + float weight_min = *std::min_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + if (symmetry == 1) + { + if (abs(weight_max) > abs(weight_min)) + scale = abs(weight_max) / fake_quant_max; + else + scale = abs(weight_min) / fake_quant_max; + zero_point = 0; + } + else + { + scale = (weight_max - weight_min) / fake_quant_max; + zero_point = int(-weight_min / scale); + } + + scale_list[c] = scale; + zero_point_list[c] = zero_point; + } + + if (weight_tensor->scale_list == NULL) + { + // printf(" EMPTY\n "); + weight_tensor->scale_list = scale_list; + weight_tensor->zp_list = zero_point_list; + } + else + { + scale_list = weight_tensor->scale_list; + zero_point_list = weight_tensor->zp_list; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale_list[i]) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? 
fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale_list[i]; + } + } + } +} + +void QuantTool::conv_hcl_interleave_pack4_fp32(int M, int K, float* pA, float* pA_t) +{ + int nn_outch = M >> 3; + int remain_outch_start = nn_outch << 3; + + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + const float* k0 = pA + (p + 0) * K; + const float* k1 = pA + (p + 1) * K; + const float* k2 = pA + (p + 2) * K; + const float* k3 = pA + (p + 3) * K; + const float* k4 = pA + (p + 4) * K; + const float* k5 = pA + (p + 5) * K; + const float* k6 = pA + (p + 6) * K; + const float* k7 = pA + (p + 7) * K; + + float* ktmp = pA_t + (p / 8) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp[1] = k1[0]; + ktmp[2] = k2[0]; + ktmp[3] = k3[0]; + ktmp[4] = k4[0]; + ktmp[5] = k5[0]; + ktmp[6] = k6[0]; + ktmp[7] = k7[0]; + ktmp += 8; + + k0 += 1; + k1 += 1; + k2 += 1; + k3 += 1; + k4 += 1; + k5 += 1; + k6 += 1; + k7 += 1; + } + } + + nn_outch = (M - remain_outch_start) >> 2; + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + const float* k0 = pA + (p + 0) * K; + const float* k1 = pA + (p + 1) * K; + const float* k2 = pA + (p + 2) * K; + const float* k3 = pA + (p + 3) * K; + + float* ktmp = pA_t + (p / 8 + (p % 8) / 4) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp[1] = k1[0]; + ktmp[2] = k2[0]; + ktmp[3] = k3[0]; + ktmp += 4; + + k0 += 1; + k1 += 1; + k2 += 1; + k3 += 1; + } + } + + remain_outch_start += nn_outch << 2; + + for (int p = remain_outch_start; p < M; p++) + { + const float* k0 = pA + (p + 0) * K; + + float* ktmp = pA_t + (p / 8 + (p % 8) / 4 + p % 4) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp++; + k0++; + } + } +} + +void QuantTool::gen_weight_scale(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel) +{ + float* scale_list = (float*)sys_malloc(elem_channel * 4); + int* 
zero_point_list = (int*)sys_malloc(elem_channel * 4); + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + float scale = 1; + int zero_point = 0; + for (int c = 0; c < elem_channel; c++) + { + float weight_max = *std::max_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + float weight_min = *std::min_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + if (symmetry == 1) + { + if (abs(weight_max) > abs(weight_min)) + scale = abs(weight_max) / fake_quant_max; + else + scale = abs(weight_min) / fake_quant_max; + zero_point = 0; + } + else + { + scale = (weight_max - weight_min) / fake_quant_max; + zero_point = int(-weight_min / scale); + } + + scale_list[c] = scale; + zero_point_list[c] = zero_point; + } + + weight_tensor->scale_list = scale_list; + weight_tensor->zp_list = zero_point_list; +} + +void QuantTool::bias_requant(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, + float* data, int elem_num, int elem_channel) +{ + int elem_perchannel = elem_num / elem_channel; + float* scale_list = (float*)sys_malloc(elem_channel * 4); + + for (int c = 0; c < elem_channel; c++) + { + float input_scale = input_tensor->scale; + float weight_scale = weight_tensor->scale_list[c]; + float bias_scale = input_scale * weight_scale; + scale_list[c] = bias_scale; + } + + bias_tensor->scale_list = scale_list; + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + { + data[data_idx] = 0; + } + else + { + data[data_idx] = round(data[data_idx] / scale_list[i]); + data[data_idx] = data[data_idx] * scale_list[i]; + } + } + } +} + +void 
QuantTool::weight_bias_reset() +{ + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + std::memcpy(this->weight_data_fake_quant, this->weight_data_fp32, this->weight_size); + std::memcpy(this->interleave_buffer_fake_quant, this->interleave_buffer_fp32, this->interleave_size_fake); + if (this->node_fake_quant->ir_node->input_num > 2) + { + memcpy(this->bias_data_fake_quant, this->bias_data_fp32, this->bias_size); + } + } +} + +void QuantTool::free_used_layers(int idx) +{ + // printf("#### free 0 idx %d\n",idx); + if (this->dict_free[idx].size() > 0) + { + // printf("#### free 1 idx %d\n",idx); + std::vector > freen_fp32; + std::vector > freen_fake_quant; + for (int fi = 0; fi < this->dict_free[idx].size(); fi++) + { + if (this->dict_free[idx][fi] != 0) + { + // printf("---free---\n"); + this->fp32_out[this->dict_free[idx][fi]].clear(); + this->fake_quant_out[this->dict_free[idx][fi]].clear(); + } + } + } +} + +void QuantTool::load_activation_scale(struct graph* graphn, const char* scale_file, int mode_sc) +{ + std::unordered_map layer_scale; + std::unordered_map layer_zeropoint; + bool parse_from_file = false; + if (nullptr != scale_file) + { + std::ifstream scales(scale_file); + std::string line; + while (std::getline(scales, line)) + { + std::string layer_name; + float scale_val = 0.f; + float zero_point = 0.f; + size_t last = 0; + size_t index = line.find_first_of(" ", last); + size_t idx = line.find_last_of(" ", line.size()); + layer_name = line.substr(last, index - last); + // printf("layer_name : %s \n", layer_name.c_str()); + last = index + 1; + scale_val = atof((line.substr(last, line.size() - last)).c_str()); + zero_point = atof((line.substr(idx + 1, line.size())).c_str()); + + layer_scale[layer_name] = scale_val; + layer_zeropoint[layer_name] = zero_point; + // fprintf(stderr, "quant value : %s %f %f \n", layer_name.c_str(), scale_val, zero_point); + } + } + + std::unordered_map layer_used; + for (int i = 0; i < graphn->node_num; i++) + { + 
struct node* noden = graphn->node_list[i]; + for (int j = 0; j < noden->input_num; j++) + { + std::string layern = graphn->tensor_list[noden->input_tensors[j]]->name; + layer_used[layern]++; + } + } + + if (mode_sc == 0) + { + for (int i = 0; i < graphn->tensor_num; i++) + { + struct tensor* t = graphn->tensor_list[i]; + if (t->tensor_type == 1 || t->tensor_type == 3) + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + } + } + } + else + { + std::unordered_map layer_pass; + for (int i = graphn->tensor_num - 1; i >= 0; i--) + { + struct tensor* t = graphn->tensor_list[i]; + if (t->tensor_type == 1 || t->tensor_type == 3) + { + if (layer_pass[t->name] == 0) + { + uint32_t ir_node_idx = t->producer; + struct node* t_node = graphn->node_list[ir_node_idx]; + + auto op_name = t_node->op.type; + + bool poolTrue = false; + bool reluTrue = false; + if (op_name == OP_POOL) + { + struct pool_param* pool_param = (struct pool_param*)t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == OP_RELU) + { + struct relu_param* relu_param = (struct relu_param*)t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + + if (op_name == OP_FLATTEN || op_name == OP_RESHAPE || op_name == OP_SQUEEZE || op_name == OP_CLIP || poolTrue || reluTrue) + { + struct tensor* t_in_tensor = graphn->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[t->name] != 0) + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + + if (t_in_tensor->tensor_type == 1 || t_in_tensor->tensor_type == 3) + { + this->recursion_pass_through(graphn, t->name, t_in_tensor, layer_used, layer_scale, + layer_zeropoint, layer_pass); + } + } + } + else + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + } + layer_pass[t->name] = 1; + } + } + } + } + + // for (int i = 0; i < graphn->tensor_num; i++) + // { + // struct ir_tensor* t = 
graphn->tensor_list[i]; + // if (t->tensor_type == 1 || t->tensor_type == 3) + // { + // printf(" sz %s %f %d \n",t->name, t->scale, t->zero_point); + // } + // } +} + +int QuantTool::get_exec_node_message(int exec_node_idx) +{ + /* get node */ + this->node_fp32 = (struct exec_node*)get_vector_data(this->exec_graph_fp32->exec_node_list, exec_node_idx); + this->node_fake_quant = (struct exec_node*)get_vector_data(this->exec_graph_fake_quant->exec_node_list, exec_node_idx); + + /* get op type */ + this->op_name = this->node_fp32->ir_node->op.type; + + /* get exec ops */ + this->node_ops_fp32 = this->node_fp32->node_ops; + this->node_ops_fake_quant = this->node_fake_quant->node_ops; + + /* handle the shape changed and dynamic shape case; fail if EITHER graph's reshape reports an error */ + if ((this->node_ops_fp32->reshape && this->node_ops_fp32->reshape(this->node_ops_fp32, this->node_fp32, this->exec_graph_fp32) < 0) + || (this->node_ops_fake_quant->reshape && this->node_ops_fake_quant->reshape(this->node_ops_fake_quant, this->node_fake_quant, this->exec_graph_fake_quant) < 0)) + { + TLOG_ERR("failed to reshape node %d, %s\n", node_fp32->ir_node->index, node_fp32->ir_node->name); + return -1; + } + + /* get output tensor */ + this->output_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->output_tensors[0]]; + this->output_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->output_tensors[0]]; + + /* get exec ops */ + this->execidx_elemnum[exec_node_idx] = this->output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[exec_node_idx] = this->output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[exec_node_idx] = this->output_tensor_fp32->name; //exec idx --> output tensor name + + return 0; +} + +void QuantTool::cosin_similarity(std::vector<double>& cosin, std::vector<std::vector<float> >& in_a, std::vector<std::vector<float> >& in_b, uint32_t imgs_num, uint32_t output_num, uint32_t output_channel) // cosin dis perchannel +{ + // fprintf(stderr, " 
in_a %f ",in_a[0][0]); + // fprintf(stderr, " in_b %f ",in_b[0][0]); + + std::vector<double> norm_a(output_channel, 0.0); + std::vector<double> norm_b(output_channel, 0.0); + std::vector<double> a_b(output_channel, 0.0); + + int elem_perchannel = int(output_num / output_channel); + + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < output_channel; j++) + { + for (int k = 0; k < elem_perchannel; k++) + { + int elem_idx = j * elem_perchannel + k; + norm_a[j] += in_a[i][elem_idx] * in_a[i][elem_idx]; + norm_b[j] += in_b[i][elem_idx] * in_b[i][elem_idx]; + a_b[j] += in_a[i][elem_idx] * in_b[i][elem_idx]; + } + } + } + + cosin.resize(output_channel); + for (int j = 0; j < output_channel; j++) + { + double _a_b_ = sqrt(norm_a[j]) * sqrt(norm_b[j]); + // fprintf(stderr, " %lf %f %f \n ", _a_b_, sqrt(norm_a[j]), sqrt(norm_b[j]) ); + if (_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin[j] = a_b[j]; + else + cosin[j] = a_b[j] / _a_b_; + if (cosin[j] < -999999 || cosin[j] > 999999) + cosin[j] = 0; + } +} + +int QuantTool::assess_quant_loss(int gen) +{ + this->init(); + for (int i = 0; i < this->exec_node_num; i++) + { + this->get_exec_node_message(i); + this->check_for_interlearve(); + + this->out_imgs_fp32.resize(this->max_search_img_num); + this->out_imgs_fake_quant.resize(this->max_search_img_num); + if (this->op_name == OP_CONV || this->op_name == OP_FC) + this->weight_bias_requant(gen); + + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, 0); + + /* op run */ + this->node_ops_fp32->run(this->node_ops_fp32, this->node_fp32, this->exec_graph_fp32); + this->node_ops_fake_quant->run(this->node_ops_fake_quant, this->node_fake_quant, this->exec_graph_fake_quant); + this->activation_requant(this->out_imgs_fake_quant[imgi].data(), this->output_tensor_fake_quant->elem_num, 8, 1, this->output_tensor_fake_quant->scale, this->output_tensor_fake_quant->zero_point); + } + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && 
this->max_search_img_num > 1)) + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], this->weight_tensor_fp32->dims[0]); + else + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) + this->print_cosin(this->cosin.data(), i, this->weight_tensor_fp32->dims[0]); + else + this->print_cosin(this->cosin.data(), i, 1); + // fprintf(stderr, "cosin [%s] : %f\n", execidx_nodename[i].c_str(), cosin); + + this->weight_bias_reset(); + this->free_used_layers(i); + + /* save node output */ + this->fp32_out.push_back(this->out_imgs_fp32); + this->fake_quant_out.push_back(this->out_imgs_fake_quant); + } + + return 0; +} + +void QuantTool::print_cosin(double* cosin, int idx, int output_channel) +{ + float avg_cosin = 0; + float avg_num = 0; + for (int c = 0; c < output_channel; c++) + { + if (cosin[c] != 0) + { + avg_cosin += cosin[c]; + avg_num++; + } + } + fprintf(stderr, "cosin %3d %4d avg %0.6f ### ", idx, output_channel, avg_cosin / avg_num); + for (int c = 0; c < output_channel; c++) + { + fprintf(stderr, "%0.6f ", cosin[c]); + } + fprintf(stderr, "\n"); +} + +void QuantTool::weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float zoom) +{ + float* scale_list = (float*)weight_tensor->scale_list; + int* zero_point_list = (int*)weight_tensor->zp_list; + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + float scale = 
scale_list[i] * zoom; + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale; + } + } + } +} +void QuantTool::weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float* zoom) +{ + float* scale_list = (float*)weight_tensor->scale_list; + int* zero_point_list = (int*)weight_tensor->zp_list; + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + float scale = 1; + if (zoom[i] > 5) + scale = scale_list[i]; + else + scale = scale_list[i] * zoom[i]; + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? 
fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale; + } + } + } +} + +int QuantTool::quant_search() +{ + this->init(); + for (int i = 0; i < this->exec_node_num; i++) + { + this->get_exec_node_message(i); + this->check_for_interlearve(); + + this->out_imgs_fp32.resize(this->max_search_img_num); + this->out_imgs_fake_quant.resize(this->max_search_img_num); + + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + this->gen_weight_scale(this->weight_tensor_fake_quant, this->weight_data_fake_quant, this->weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0]); + this->gen_weight_scale(this->weight_tensor_fp32, this->weight_data_fp32, this->weight_tensor_fp32->elem_num, 8, 1, weight_tensor_fp32->dims[0]); + + std::vector<double> cosin_save(weight_tensor_fake_quant->dims[0], -1); + std::vector<float> zoom_save(weight_tensor_fake_quant->dims[0], -1); + for (int snum = 0; snum < 201; snum = snum + 20) + { + float zoom = 1.3 / 200 * (snum + 1); + // float zoom = 1.0; + /* weight requant */ + if (snum < 200) + this->weight_requant_search(weight_tensor_fake_quant, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0], zoom); + else + { + this->weight_requant_search(weight_tensor_fake_quant, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0], zoom_save.data()); + float* buf = (float*)sys_malloc(weight_tensor_fake_quant->dims[0] * 4); + memcpy(buf, zoom_save.data(), weight_tensor_fake_quant->dims[0] * 4); + // printf(" scale3 %f \n",weight_tensor_fp32->scale_list[0]); + for (int bi = 0; bi < weight_tensor_fake_quant->dims[0]; bi++) + { + buf[bi] *= weight_tensor_fp32->scale_list[bi]; + } + // printf(" scale4 %f \n",buf[0]); + // weight_tensor_fake_quant->scale_list = buf; + weight_tensor_fp32->scale_list = buf; + weight_tensor_fp32->quant_param_num = weight_tensor_fp32->dims[0]; + // printf(" scale5 %f 
\n",weight_tensor_fp32->scale_list[0]); + } + if (interleave_size_fake != 0) + { + int M = weight_tensor_fake_quant->dims[0]; + int K = weight_tensor_fake_quant->elem_num / weight_tensor_fake_quant->dims[0]; + this->conv_hcl_interleave_pack4_fp32(M, K, weight_data_fake_quant, interleave_buffer_fake_quant); + } + + /* bias requant */ + if (node_fake_quant->ir_node->input_num > 2) + { + struct tensor* input_tensor_fake_quant = graphn_fake_quant->tensor_list[node_fake_quant->ir_node->input_tensors[0]]; + struct tensor* bias_tensor_fake_quant = graphn_fake_quant->tensor_list[node_fake_quant->ir_node->input_tensors[2]]; + struct tensor* bias_tensor_fp32 = graphn_fp32->tensor_list[node_fp32->ir_node->input_tensors[2]]; + + bias_size = bias_tensor_fp32->elem_num * bias_tensor_fp32->elem_size; + + bias_data_fp32 = (float*)bias_tensor_fp32->data; + bias_data_fake_quant = (float*)bias_tensor_fake_quant->data; + + this->bias_requant(input_tensor_fake_quant, weight_tensor_fake_quant, bias_tensor_fake_quant, + bias_data_fake_quant, bias_tensor_fake_quant->elem_num, bias_tensor_fake_quant->dims[0]); + } + + /* per image run */ + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, snum); + + /* FP32 op run */ + if (snum == 0) + { + // set_tensor_buffer(output_tensor_fp32, out_imgs_fp32[imgi].data(), output_tensor_fp32->elem_num * output_tensor_fp32->elem_size); + node_ops_fp32->run(node_ops_fp32, node_fp32, exec_graph_fp32); + + this->execidx_elemnum[i] = output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[i] = output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[i] = output_tensor_fp32->name; + } + + /* fake quant op run */ + // set_tensor_buffer(output_tensor_fake_quant, out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num * output_tensor_fake_quant->elem_size); + node_ops_fake_quant->run(node_ops_fake_quant, node_fake_quant, 
exec_graph_fake_quant); + this->activation_requant(out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num, 8, 1, output_tensor_fake_quant->scale, output_tensor_fake_quant->zero_point); + } // image number + + output_channel = output_tensor_fp32->dims[1]; + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], output_channel); + else + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + + // this->cosin_similarity(this->cosin, out_imgs_fp32, out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], output_channel); + + for (int cosi = 0; cosi < output_channel; cosi++) + { + if (cosin[cosi] > cosin_save[cosi]) + { + cosin_save[cosi] = cosin[cosi]; + zoom_save[cosi] = zoom; + } + } + if (snum == 200) + { + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) + this->print_cosin(this->cosin.data(), i, output_channel); + else + this->print_cosin(this->cosin.data(), i, 1); + } + + if (op_name == OP_CONV || op_name == OP_FC) + { + memcpy(weight_data_fake_quant, weight_data_fp32, weight_size); + // this->weight_correction(weight_data_fp32, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, this->bitcount, this->symmetry, weight_tensor_fake_quant->dims[0]); + memcpy(interleave_buffer_fake_quant, interleave_buffer_fp32, interleave_size_fake); + if (node_fake_quant->ir_node->input_num > 2) + { + memcpy(bias_data_fake_quant, bias_data_fp32, bias_size); + } + } + } + } + else + { + /* per image run */ + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, 0); + + // set_tensor_buffer(output_tensor_fp32, out_imgs_fp32[imgi].data(), output_tensor_fp32->elem_num * 
output_tensor_fp32->elem_size); + node_ops_fp32->run(node_ops_fp32, node_fp32, exec_graph_fp32); + + /* fake quant op run */ + // set_tensor_buffer(output_tensor_fake_quant, out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num * output_tensor_fake_quant->elem_size); + node_ops_fake_quant->run(node_ops_fake_quant, node_fake_quant, exec_graph_fake_quant); + this->activation_requant(out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num, 8, 1, output_tensor_fake_quant->scale, output_tensor_fake_quant->zero_point); + + this->execidx_elemnum[i] = output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[i] = output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[i] = output_tensor_fp32->name; + } + this->cosin_similarity(this->cosin, out_imgs_fp32, out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + this->print_cosin(this->cosin.data(), i, 1); + this->execidx_loss[i] = cosin; + } + + this->free_used_layers(i); + + /* save node output */ + this->fp32_out.push_back(this->out_imgs_fp32); + this->fake_quant_out.push_back(this->out_imgs_fake_quant); + } // node number + // fprintf(stderr, "--------------------------------------\n"); + + if (!save_graph(graphn_fp32, "save_i8_eq.tmfile")) + { + fprintf(stderr, "save graph failed.\n"); + return -1; + } + + return 0; +} diff --git a/tools/quantize/quant_tool.hpp b/tools/quantize/quant_tool.hpp index c413eaad0..99b43beba 100644 --- a/tools/quantize/quant_tool.hpp +++ b/tools/quantize/quant_tool.hpp @@ -61,7 +61,6 @@ typedef std::unordered_map > dict_uint2vecuint; typedef std::unordered_map dict_uint2str; typedef std::unordered_map > dict_uint2doublex; - #define ALGORITHM_MIN_MAX 0 #define ALGORITHM_KL 1 #define ALGORITHM_ACIQ 2 @@ -89,8 +88,8 @@ class QuantTool private: void recursion_pass_through(struct graph* graphn, const char* layer_name, struct tensor* t, - dict_str2int &layer_used, dict_str2float 
&layer_scale, - dict_str2float &layer_zeropoint, dict_str2int &layer_pass); + dict_str2int& layer_used, dict_str2float& layer_scale, + dict_str2float& layer_zeropoint, dict_str2int& layer_pass); struct exec_graph* get_exec_graph(struct graph* graphn); void load_activation_scale(struct graph* graphn, const char* scale_file, int mode_sc); @@ -100,16 +99,16 @@ class QuantTool void check_for_interlearve(); void weight_bias_requant(int search); void conv_hcl_interleave_pack4_fp32(int M, int K, float* pA, float* pA_t); - void activation_requant(float* data, int elem_num, int bitcount, int symmetry, float scale, int zero_point=0); + void activation_requant(float* data, int elem_num, int bitcount, int symmetry, float scale, int zero_point = 0); void weight_requant(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel); void weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float zoom); void weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float* zoom); void bias_requant(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, float* data, int elem_num, int elem_channel); void set_node_input_output_tensor(int idx, int imgi, int snum); - double cosin_similarity(std::vector* in_a,std::vector* in_b, uint32_t imgs_num, uint32_t output_num); - double cosin_similarity(std::vector > &in_a,std::vector > &in_b, uint32_t imgs_num, uint32_t output_num); - void cosin_similarity(std::vector &cosin, std::vector > &in_a,std::vector > &in_b, uint32_t imgs_num, uint32_t output_num, uint32_t output_channel); // cosin dis perchannel + double cosin_similarity(std::vector* in_a, std::vector* in_b, uint32_t imgs_num, uint32_t output_num); + double cosin_similarity(std::vector >& in_a, std::vector >& in_b, uint32_t imgs_num, uint32_t output_num); + void 
cosin_similarity(std::vector& cosin, std::vector >& in_a, std::vector >& in_b, uint32_t imgs_num, uint32_t output_num, uint32_t output_channel); // cosin dis perchannel void weight_bias_reset(); void free_used_layers(int idx); void gen_weight_scale(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel); @@ -130,9 +129,9 @@ class QuantTool int img_c; int img_h; int img_w; - float mean[3]; // value of mean (mean value, default is 104.0,117.0,123.0) - float scale[3]; // value of normalize (scale value, default is 1.0,1.0,1.0) - int center_crop; // flag which indicates that center crop process image is necessary(0:OFF, 1:ON, default is 0) + float mean[3]; // value of mean (mean value, default is 104.0,117.0,123.0) + float scale[3]; // value of normalize (scale value, default is 1.0,1.0,1.0) + int center_crop; // flag which indicates that center crop process image is necessary(0:OFF, 1:ON, default is 0) int letterbox_rows; int letterbox_cols; int sw_RGB; // flag which indicates that swap first and last channels in 3-channel image is necessary(0:OFF, 1:ON, default is 1) @@ -142,13 +141,13 @@ class QuantTool bool evaluate; // evaluate quantitative losses private: // system variable - dict_uint2uint ir_exec; - dict_uint2uint exec_ir; - dict_uint2vecuint dict_free; - dict_uint2uint execidx_elemnum; - dict_uint2uint execidx_elemsize; - dict_uint2str execidx_nodename; - dict_uint2doublex execidx_loss; + dict_uint2uint ir_exec; + dict_uint2uint exec_ir; + dict_uint2vecuint dict_free; + dict_uint2uint execidx_elemnum; + dict_uint2uint execidx_elemsize; + dict_uint2str execidx_nodename; + dict_uint2doublex execidx_loss; int max_search_img_num; @@ -169,8 +168,8 @@ class QuantTool struct graph* graphn_fp32; struct graph* graphn_fake_quant; - struct tensor* graph_input_tensor_fp32; - struct tensor* graph_input_tensor_fake_quant; + struct tensor* graph_input_tensor_fp32; + struct tensor* graph_input_tensor_fake_quant; struct 
exec_graph* exec_graph_fp32; struct exec_graph* exec_graph_fake_quant; int exec_node_num; diff --git a/tools/quantize/quant_tool_int8.cpp b/tools/quantize/quant_tool_int8.cpp index 859840b50..3c63ebf0b 100644 --- a/tools/quantize/quant_tool_int8.cpp +++ b/tools/quantize/quant_tool_int8.cpp @@ -174,7 +174,7 @@ int QuantTool::activation_quant_tool() act_tensor_num++; max_activation[i] = -FLT_MAX; min_activation[i] = FLT_MAX; - act_map[act_tensor_num-1] = i; + act_map[act_tensor_num - 1] = i; } } @@ -292,7 +292,7 @@ int QuantTool::activation_quant_tool() fprintf(stderr, " threshold_bin %d \n", threshold_bin); float act_scale = hist_edge[i][threshold_bin] / fake_quant_set; - int act_zero_point = 0; + int act_zero_point = 0; /* the scale of softmax always is scale = 1 / 127.f */ for (int j = 0; j < ir_graph->node_num; j++) @@ -431,7 +431,7 @@ int QuantTool::activation_quant_tool() fprintf(stderr, "\r\n[Quant Tools Info]: Step 2, find original calibration minmax threshold table done, output ./table_minmax.scale\n"); } -// fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num); + // fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num); /* release tengine */ postrun_graph(ir_graph); @@ -574,102 +574,100 @@ int main(int argc, char* argv[]) fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus ? 
"ON" : "OFF"); fprintf(stderr, "Thread num : %d\n\n", quant_tool.num_thread); - - switch(quant_tool.algorithm_type) + switch (quant_tool.algorithm_type) + { + case ALGORITHM_MIN_MAX: { - case ALGORITHM_MIN_MAX: + if (quant_tool.scale_file.empty()) { - if (quant_tool.scale_file.empty()) - { - quant_tool.scale_file = "table_minmax.scale"; - quant_tool.activation_quant_tool(); - } - save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); - /* Evaluate quantitative losses */ - if (quant_tool.evaluate) - { - fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); - quant_tool.assess_quant_loss(0); - } - break; + quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); } - case ALGORITHM_KL: + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) { - if (quant_tool.scale_file.empty()) - { - quant_tool.scale_file = "table_kl.scale"; - quant_tool.activation_quant_tool(); - } - save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); - /* Evaluate quantitative losses */ - if (quant_tool.evaluate) - { - fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); - quant_tool.assess_quant_loss(0); - } - break; + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); } - case ALGORITHM_ACIQ: + break; + } + case ALGORITHM_KL: + { + if (quant_tool.scale_file.empty()) { - if (quant_tool.scale_file.empty()) - { - quant_tool.scale_file = "table_aciq.scale"; - quant_tool.activation_quant_tool(); - } - save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, 
quant_tool.inplace, false); - /* Evaluate quantitative losses */ - if (quant_tool.evaluate) - { - fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); - quant_tool.assess_quant_loss(0); - } - break; + quant_tool.scale_file = "table_kl.scale"; + quant_tool.activation_quant_tool(); } - case ALGORITHM_DFQ: + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) { - quant_tool.data_free_quant(); - quant_tool.model_file = "test_dfq_fp32.tmfile"; - if (quant_tool.scale_file.empty()) - { - quant_tool.scale_file = "table_minmax.scale"; - quant_tool.activation_quant_tool(); - } - save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); - /* Evaluate quantitative losses */ - if (quant_tool.evaluate) - { - fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); - quant_tool.assess_quant_loss(0); - } - break; + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); } - case ALGORITHM_MM_EQ: + break; + } + case ALGORITHM_ACIQ: + { + if (quant_tool.scale_file.empty()) { - if (quant_tool.scale_file.empty()) - { - quant_tool.scale_file = "table_minmax.scale"; - quant_tool.activation_quant_tool(); - } - /* Evaluate quantitative losses */ - if (quant_tool.evaluate) - { - fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); - quant_tool.assess_quant_loss(0); - } - /* Enable EQ search */ - fprintf(stderr, "[Quant Tools Info]: Step Search, enable EQ search\n"); - quant_tool.quant_search(); - quant_tool.model_file = "save_i8_eq.tmfile"; - save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, true); - break; + 
quant_tool.scale_file = "table_aciq.scale"; + quant_tool.activation_quant_tool(); } - default: + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) { - fprintf(stderr,"Unsupported quantization type ... \n"); - break; + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); } + break; + } + case ALGORITHM_DFQ: + { + quant_tool.data_free_quant(); + quant_tool.model_file = "test_dfq_fp32.tmfile"; + if (quant_tool.scale_file.empty()) + { + quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); + } + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; + } + case ALGORITHM_MM_EQ: + { + if (quant_tool.scale_file.empty()) + { + quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); + } + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + /* Enable EQ search */ + fprintf(stderr, "[Quant Tools Info]: Step Search, enable EQ search\n"); + quant_tool.quant_search(); + quant_tool.model_file = "save_i8_eq.tmfile"; + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, true); + break; + } + default: + { + fprintf(stderr, "Unsupported quantization type ... 
\n"); + break; + } } - fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\\(^0^)/ ----\n"); diff --git a/tools/quantize/quant_utils.cpp b/tools/quantize/quant_utils.cpp index c8265332e..f938f3617 100644 --- a/tools/quantize/quant_utils.cpp +++ b/tools/quantize/quant_utils.cpp @@ -421,7 +421,7 @@ std::vector histCount(float* data, uint32_t elem_num, float abs_max) if (data[i] != 0) { uint32_t hist_idx = round(std::abs(data[i]) / bin_scale); - hist[hist_idx] ++; + hist[hist_idx]++; } } return hist;