diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index ba8d3d961..b8504e500 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -30,6 +30,8 @@ IF (${TENGINE_TARGET_PROCESSOR} MATCHES "X86") ADD_EXECUTABLE( ${name} ./quant_save_graph.cpp + ./algorithm/quant_dfq.cpp + ./algorithm/quant_eq.cpp ./quant_utils.cpp ../save_graph/save_graph.cpp ../save_graph/tm2_op_save.cpp diff --git a/tools/quantize/README.md b/tools/quantize/README.md index e0e1d7547..968dc1441 100644 --- a/tools/quantize/README.md +++ b/tools/quantize/README.md @@ -76,7 +76,7 @@ Status : int8, per-channel, symmetric Before use the quant tool, **you need Float32 tmfile and Calibration Dataset**, the image num of calibration dataset we suggest to use 500-1000. ``` -$ .quant_tool_int8 -m ./mobilenet_fp32.tmfile -i ./dataset -o ./mobilenet_int8.tmfile -g 3,224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017 +$ ./quant_tool_int8 -m ./mobilenet_fp32.tmfile -i ./dataset -o ./mobilenet_int8.tmfile -g 3,224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017 -z 1 ---- Tengine Post Training Quantization Tool ---- @@ -111,6 +111,38 @@ Thread num : 1 [Quant Tools Info]: Step 4, quantize activation tensor done. [Quant Tools Info]: Step 5, quantize weight tensor done. [Quant Tools Info]: Step 6, save Int8 tmfile done, ./mobilenet_int8.tmfile +[Quant Tools Info]: Step Evaluate, evaluate quantitative losses +cosin 0 32 avg 0.995317 ### 0.000000 0.953895 0.998249 0.969256 ... +cosin 1 32 avg 0.982403 ### 0.000000 0.902383 0.964436 0.873998 ... +cosin 2 64 avg 0.976753 ### 0.952854 0.932301 0.982766 0.958503 ... +cosin 3 64 avg 0.981889 ### 0.976637 0.981754 0.987276 0.970671 ... +cosin 4 128 avg 0.979728 ### 0.993999 0.991858 0.990438 0.992766 ... +cosin 5 128 avg 0.970351 ### 0.772556 0.989541 0.986996 0.989563 ... +cosin 6 128 avg 0.954545 ### 0.950125 0.922964 0.946804 0.972852 ... +cosin 7 128 avg 0.977192 ### 0.994728 0.972071 0.995353 0.992700 ... 
+cosin 8 256 avg 0.977426 ### 0.968429 0.991248 0.991274 0.994450 ... +cosin 9 256 avg 0.962224 ### 0.985255 0.969171 0.958762 0.967461 ... +cosin 10 256 avg 0.954253 ### 0.984353 0.935643 0.656188 0.929778 ... +cosin 11 256 avg 0.971987 ### 0.997596 0.967681 0.476525 0.999115 ... +cosin 12 512 avg 0.972861 ### 0.968920 0.905907 0.993918 0.622953 ... +cosin 13 512 avg 0.959161 ### 0.935686 0.000000 0.642560 0.994388 ... +cosin 14 512 avg 0.963903 ### 0.979613 0.957169 0.976440 0.902512 ... +cosin 15 512 avg 0.963226 ### 0.977065 0.965819 0.998149 0.905297 ... +cosin 16 512 avg 0.960935 ### 0.861674 0.972926 0.950579 0.987609 ... +cosin 17 512 avg 0.961057 ### 0.738472 0.987884 0.999124 0.995397 ... +cosin 18 512 avg 0.960127 ### 0.935455 0.968909 0.970831 0.981240 ... +cosin 19 512 avg 0.963755 ### 0.972628 0.992305 0.999518 0.799737 ... +cosin 20 512 avg 0.949364 ### 0.922776 0.896038 0.945079 0.971338 ... +cosin 21 512 avg 0.961256 ### 0.902256 0.896438 0.923361 0.973974 ... +cosin 22 512 avg 0.946552 ### 0.963806 0.982075 0.878965 0.929992 ... +cosin 23 512 avg 0.953677 ### 0.953880 0.996364 0.936540 0.930796 ... +cosin 24 1024 avg 0.941197 ### 0.000000 0.992507 1.000000 0.994460 ... +cosin 25 1024 avg 0.973546 ### 1.000000 0.889181 0.000000 0.998084 ... +cosin 26 1024 avg 0.869351 ### 0.522966 0.000000 0.987009 0.000000 ... +cosin 27 1 avg 0.974982 ### 0.974982 +cosin 28 1 avg 0.974982 ### 0.974982 +cosin 29 1 avg 0.974982 ### 0.974982 +cosin 30 1 avg 0.978486 ### 0.978486 ---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\(^0^)/ ---- ``` diff --git a/tools/quantize/algorithm/quant_dfq.cpp b/tools/quantize/algorithm/quant_dfq.cpp new file mode 100644 index 000000000..7b17ddbbf --- /dev/null +++ b/tools/quantize/algorithm/quant_dfq.cpp @@ -0,0 +1,572 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "../quant_tool.hpp" + +//int QuantTool::data_free_quant(const char* model_file, const char* image_dir, +// int img_c, int img_h, int img_w, const float* mean, const float* scale, +// int num_thread, int sw_RGB, int center_crop) +int QuantTool::data_free_quant() +{ + int letterbox = 0; + int loop_count = 1; + const char* image_file = nullptr; + + /* set runtime options */ + struct options opt; + opt.num_thread = num_thread; + opt.cluster = TENGINE_CLUSTER_ALL; + opt.precision = TENGINE_MODE_FP32; + + // /* inital tengine */ + // if (init_tengine() != 0) + // { + // fprintf(stderr, "Initial tengine failed.\n"); + // return -1; + // } + // fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version()); + + /* create graph, load tengine model xxx.tmfile */ + graph_t graph = create_graph(NULL, "tengine", model_file.c_str()); + if (NULL == graph) + { + fprintf(stderr, "Create graph failed.\n"); + fprintf(stderr, "errno: %d \n", get_tengine_errno()); + return -1; + } + + struct graph* graphn = (struct graph*)graph; + struct node_graph* node_proto = (struct node_graph*)sys_malloc(sizeof(struct node_graph) * graphn->node_num); + + for (int i = 0; i < graphn->node_num; i++) + { + 
struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_type = n->op.type; + const char* layer_name = n->name; //layer name + + const uint16_t input_num = n->input_num; //input num + const uint16_t output_num = n->output_num; //output num + + node_proto[i].pass = 0; + // node_proto[i].input_node_list = create_vector(sizeof(uint16_t), NULL); + // node_proto[i].output_node_list = create_vector(sizeof(uint16_t), NULL); + + for (int j = 0; j < input_num; j++) + { + struct tensor* input_tensor = get_ir_graph_tensor(graphn, n->input_tensors[j]); + const char* input_tensor_name = input_tensor->name; + uint8_t dim_num = input_tensor->dim_num; + + if (input_tensor->producer >= 0) + { + struct node* node = graphn->node_list[input_tensor->producer]; + node_proto[i].input_node_list.push_back(node->index); + node_proto[node->index].output_node_list.push_back(i); + } + if (OP_CONV == op_type || OP_FC == op_type) + { + break; + } + } + } + + for (int i = 0; i < graphn->node_num; i++) + { + struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_type = n->op.type; + const char* layer_name = n->name; //layer name + if (op_type != NULL) + { + if (OP_CONV != op_type && OP_FC != op_type) + { + if (node_proto[i].input_node_list.size() == 1 && node_proto[i].output_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + uint16_t node_output_id = node_proto[i].output_node_list[0]; + if (node_proto[node_input_id].output_node_list.size() == 1 && node_proto[node_output_id].input_node_list.size() == 1) + { + node_proto[i].input_node_list.erase(node_proto[i].input_node_list.begin() + 0); + node_proto[i].output_node_list.erase(node_proto[i].output_node_list.begin() + 0); + + node_proto[node_input_id].output_node_list.erase(node_proto[node_input_id].output_node_list.begin() + 0); + node_proto[node_input_id].output_node_list.push_back(node_output_id); + + 
node_proto[node_output_id].input_node_list.erase(node_proto[node_output_id].input_node_list.begin() + 0); + node_proto[node_output_id].input_node_list.push_back(node_input_id); + } + } + } + } + } + + for (int i = 0; i < graphn->node_num; i++) + { + struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_name = n->op.type; + const char* layer_name = n->name; //layer name + + const uint16_t input_num = n->input_num; //input num + const uint16_t output_num = n->output_num; //output num + + if (op_name != NULL) + { + if (OP_CONV == op_name) + { + // DW_Conv && Direct_Conv + struct conv_param* conv_param = (struct conv_param*)n->op.param_mem; + if (conv_param->group == conv_param->output_channel) + { + // printf(" #### DW Conv ####\n"); + if (node_proto[i].input_node_list.size() == 1 && node_proto[i].output_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + uint16_t node_output_id = node_proto[i].output_node_list[0]; + auto op_name0 = graphn->node_list[node_input_id]->op.type; + auto op_name2 = graphn->node_list[node_output_id]->op.type; + + if (node_proto[node_input_id].output_node_list.size() == 1 && node_proto[node_output_id].input_node_list.size() == 1 && OP_CONV == op_name0 && OP_CONV == op_name2) + { + node_proto[i].pass = 1; //layer1 + node_proto[node_input_id].pass = 1; //layer0 + node_proto[node_output_id].pass = 1; //layer2 + + // layer0 min/max range + struct node* nodeP = graphn->node_list[node_input_id]; + struct tensor* input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + uint16_t dims0 = input_tensor->dims[0]; + uint16_t dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer0_max(dims0, 0.0f); + std::vector layer0_min(dims0, 0.0f); + std::vector layer0_range(dims0, 0.0f); + + float* data_layer0 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; 
d1++) + { + if (data_layer0[dims123 * d0 + d1] > layer0_max[d0]) + layer0_max[d0] = data_layer0[dims123 * d0 + d1]; + if (data_layer0[dims123 * d0 + d1] < layer0_min[d0]) + layer0_min[d0] = data_layer0[dims123 * d0 + d1]; + } + } + // printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer0_range[d0] = layer0_max[d0] - layer0_min[d0]; + } + + // layer1 min/max range + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer1_max(dims0, 0.0f); + std::vector layer1_min(dims0, 0.0f); + std::vector layer1_range(dims0, 0.0f); + + float* data_layer1 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + if (data_layer1[dims123 * d0 + d1] > layer1_max[d0]) + layer1_max[d0] = data_layer1[dims123 * d0 + d1]; + if (data_layer1[dims123 * d0 + d1] < layer1_min[d0]) + layer1_min[d0] = data_layer1[dims123 * d0 + d1]; + } + } + // printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer1_range[d0] = layer1_max[d0] - layer1_min[d0]; + } + + // layer2 min/max range + nodeP = graphn->node_list[node_output_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + uint16_t dims1 = input_tensor->dims[1]; + uint16_t dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer2_max(dims0, 0.0f); + std::vector layer2_min(dims0, 0.0f); + std::vector layer2_range(dims0, 0.0f); + + float* data_layer2 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + if (data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] > layer2_max[d1]) + { + layer2_max[d1] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + if (data_layer2[dims1 * dims23 * 
d0 + dims23 * d1 + d2] < layer2_min[d1]) + { + layer2_min[d1] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + } + } + } + // printf("### %d ###\n",dims1); + for (int d1 = 0; d1 < dims1; d1++) + { + layer2_range[d1] = layer2_max[d1] - layer2_min[d1]; + } + + ////////////////////////////////////////////////////////////////////////////////// + + // layer ops sqrt + float ops_range[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + ops_range[ops] = pow(layer0_range[ops] * layer1_range[ops] * layer2_range[ops], 1.0 / 3); + } + + float S01[dims1]; + float S01_F[dims1]; + float S12[dims1]; + float S12_F[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + if (ops_range[ops] == 0) + { + S01[ops] = 0.0; + S12_F[ops] = 0.0; + } + else + { + S01[ops] = layer0_range[ops] / ops_range[ops]; + S12_F[ops] = layer2_range[ops] / ops_range[ops]; + } + if (layer0_range[ops] == 0) + S01_F[ops] = 0.0; + else + S01_F[ops] = ops_range[ops] / layer0_range[ops]; + if (layer2_range[ops] == 0) + S12[ops] = 0.0; + else + S12[ops] = ops_range[ops] / layer2_range[ops]; + } + ////////////////////////////////////////////////////////////////////////////////// + + // layer0 output + nodeP = graphn->node_list[node_input_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer0[dims123 * d0 + d1] = data_layer0[dims123 * d0 + d1] * S01_F[d0]; + } + } + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + dims0 = input_tensor->dims[0]; + float* data_layer0_bias = (float*)sys_malloc(sizeof(float) * dims0); + data_layer0_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer0_bias[d0] = data_layer0_bias[d0] * S01_F[d0]; + } + + // layer1 output + nodeP = graphn->node_list[i]; + input_tensor = 
get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer1[dims123 * d0 + d1] = data_layer1[dims123 * d0 + d1] * S01[d0] * S12_F[d0]; + } + } + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + dims0 = input_tensor->dims[0]; + float* data_layer1_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer1_bias[d0] = data_layer1_bias[d0] * S12_F[d0]; + } + + // layer2 output + nodeP = graphn->node_list[node_output_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims1 = input_tensor->dims[1]; + dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] * S12[d1]; + } + } + } + } + } + } + else + { + // printf(" #### Direct Conv ####\n"); + if (node_proto[i].pass == 0) + { + if (node_proto[i].input_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + if (graphn->node_list[node_input_id]->input_num > 0) + { + auto op_name0 = graphn->node_list[node_input_id]->op.type; + + if (node_proto[node_input_id].output_node_list.size() == 1 && op_name0 == OP_CONV) + { + node_proto[i].pass = 1; //layer1 + node_proto[node_input_id].pass = 1; //layer0 + + // layer0 min/max range + struct node* nodeP = graphn->node_list[node_input_id]; + struct tensor* input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + uint16_t dims0 = input_tensor->dims[0]; + uint16_t dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer0_max(dims0, 0.0f); + std::vector 
layer0_min(dims0, 0.0f); + std::vector layer0_range(dims0, 0.0f); + + float* data_layer0 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + if (data_layer0[dims123 * d0 + d1] > layer0_max[d0]) + layer0_max[d0] = data_layer0[dims123 * d0 + d1]; + if (data_layer0[dims123 * d0 + d1] < layer0_min[d0]) + layer0_min[d0] = data_layer0[dims123 * d0 + d1]; + } + } + // printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer0_range[d0] = layer0_max[d0] - layer0_min[d0]; + } + + // layer1 min/max range + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + uint16_t dims1 = input_tensor->dims[1]; + uint16_t dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer1_max(dims0, 0.0f); + std::vector layer1_min(dims0, 0.0f); + std::vector layer1_range(dims0, 0.0f); + + float* data_layer1 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + if (data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] > layer1_max[d1]) + { + layer1_max[d1] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + if (data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] < layer1_min[d1]) + { + layer1_min[d1] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + } + } + } + // printf("### %d ###\n",dims1); + for (int d0 = 0; d0 < dims1; d0++) + { + layer1_range[d0] = layer1_max[d0] - layer1_min[d0]; + } + + ////////////////////////////////////////////////////////////////////////////////// + + // layer ops sqrt + float ops_range[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + ops_range[ops] = sqrt(layer0_range[ops] * layer1_range[ops]); + } + + float S01[dims1]; + float S01_F[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + if (ops_range[ops] == 0) + { + S01[ops] = 0.0; + } + else + { + S01[ops] = 
layer0_range[ops] / ops_range[ops]; + } + if (layer0_range[ops] == 0) + S01_F[ops] = 0.0; + else + S01_F[ops] = ops_range[ops] / layer0_range[ops]; + } + ////////////////////////////////////////////////////////////////////////////////// + // layer0 output + nodeP = graphn->node_list[node_input_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer0[dims123 * d0 + d1] = data_layer0[dims123 * d0 + d1] * S01_F[d0]; + } + } + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + dims0 = input_tensor->dims[0]; + float* data_layer0_bias = (float*)sys_malloc(sizeof(float) * dims0); + data_layer0_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer0_bias[d0] = data_layer0_bias[d0] * S01_F[d0]; + } + + // layer1 output + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims1 = input_tensor->dims[1]; + dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] * S01[d1]; + } + } + } + } + } + } + } + } + } + } + } + + if (!save_graph(graph, "test_dfq_fp32.tmfile")) + { + fprintf(stderr, "save graph failed.\n"); + return -1; + } + + /* set the shape, data buffer of input_tensor of the graph */ + int img_size = img_h * img_w * img_c; + int dims[] = {1, img_c, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if (input_tensor == NULL) + { + fprintf(stderr, "Get input tensor 
failed\n"); + return -1; + } + + if (set_tensor_shape(input_tensor, dims, 4) < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + return -1; + } + + if (set_tensor_buffer(input_tensor, input_data, img_size * 4) < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread(graph, opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + std::vector imgs_list; + if (image_dir.c_str() != NULL) + { + readFileList(image_dir, imgs_list); + } + else + { + imgs_list.push_back(image_file); + } + uint32_t img_num = imgs_list.size(); + + /* prepare process input data, set the data mem to input tensor */ + get_input_data_cv(imgs_list[0].c_str(), input_data, img_c, img_h, img_w, mean, scale, + 1, 0, 0, 0, 0); + + /* run graph */ + for (int i = 0; i < loop_count; i++) + { + double start = get_current_time(); + if (run_graph(graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + } + + /* get the result of classification */ + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + float* output_data = (float*)get_tensor_buffer(output_tensor); + int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); + + // printf("out put data %f %d \n",output_data[0], output_size); + fprintf(stderr, "--------------------------------------\n"); + + /* release tengine */ + free(input_data); + postrun_graph(graph); + destroy_graph(graph); + // release_tengine(); + + return 0; +} diff --git a/tools/quantize/algorithm/quant_eq.cpp b/tools/quantize/algorithm/quant_eq.cpp new file mode 100644 index 000000000..b8e80fe6b --- /dev/null +++ b/tools/quantize/algorithm/quant_eq.cpp @@ -0,0 +1,1283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "../quant_tool.hpp" + +int QuantTool::init() +{ + // ir graph variable + this->fp32_out.clear(); + this->fake_quant_out.clear(); + + /* load fp32 graph and fake quant graph */ + this->graphn_fp32 = (struct graph*)create_graph(nullptr, "tengine", this->model_file.c_str()); + this->graphn_fake_quant = (struct graph*)create_graph(nullptr, "tengine", this->model_file.c_str()); + + if (this->graphn_fp32 == nullptr || this->graphn_fake_quant == nullptr) + { + fprintf(stderr, "Create graph failed.\n"); + fprintf(stderr, "errno: %d \n", get_tengine_errno()); + return -1; + } + + /* load activation scale to ir_tensor */ + this->load_activation_scale(this->graphn_fp32, this->scale_file.c_str(), this->inplace); + this->load_activation_scale(this->graphn_fake_quant, this->scale_file.c_str(), this->inplace); + + /* get graph input tensor */ + this->graph_input_tensor_fp32 = (struct tensor*)get_graph_input_tensor((void*)this->graphn_fp32, 0, 0); + this->graph_input_tensor_fake_quant = (struct tensor*)get_graph_input_tensor((void*)this->graphn_fake_quant, 0, 0); + if (this->graph_input_tensor_fp32 == nullptr || this->graph_input_tensor_fake_quant == nullptr) + { + fprintf(stderr, "Get input tensor 
failed\n"); + return -1; + } + + /* generate images list */ + std::vector imgs_list; + if (!this->image_dir.empty()) + readFileList(this->image_dir, imgs_list); + uint32_t img_num = imgs_list.size(); + + this->max_search_img_num = 50; + if (img_num < this->max_search_img_num) + this->max_search_img_num = img_num; + + // fprintf(stderr, "# eq dataset num %d\n", this->max_search_img_num); + + /* set the shape, data buffer of input_tensor of the graph */ + this->img_size = this->img_h * this->img_w * this->img_c; + int dims[] = {1, img_c, img_h, img_w}; // nchw + float* input_data_fp32 = (float*)malloc(this->img_size * sizeof(float)); + float* input_data_fake_quant = (float*)malloc(this->img_size * sizeof(float)); + + /* prepare process input data, set the data mem to input tensor */ + float scale_graph_input = this->graph_input_tensor_fake_quant->scale; + int zero_point_graph_input = this->graph_input_tensor_fake_quant->zero_point; + // fprintf(stderr, "scale zp %f %d\n", scale_graph_input, zero_point_graph_input); + + this->input_datas_fp32.resize(this->max_search_img_num); + this->input_datas_fake_quant.resize(this->max_search_img_num); + + for (int i = 0; i < this->max_search_img_num; i++) + { + this->input_datas_fp32[i].resize(this->img_size); + this->input_datas_fake_quant[i].resize(this->img_size); + + get_input_data_cv(imgs_list[i].c_str(), this->input_datas_fp32[i].data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); + + this->input_datas_fake_quant[i] = this->input_datas_fp32[i]; + this->activation_requant(this->input_datas_fake_quant[i].data(), this->img_size, 8, 1, scale_graph_input, + zero_point_graph_input); + } + + /* set graph input shape */ + int ret_fp32 = set_tensor_shape(this->graph_input_tensor_fp32, dims, 4); + int ret_fake_quant = set_tensor_shape(this->graph_input_tensor_fake_quant, dims, 4); + if (ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + 
return -1; + } + + /* set graph input buffer */ + ret_fp32 = set_tensor_buffer(this->graph_input_tensor_fp32, input_data_fp32, this->img_size * 4); + ret_fake_quant = set_tensor_buffer(this->graph_input_tensor_fake_quant, input_data_fake_quant, this->img_size * 4); + if (ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread((void*)this->graphn_fp32, this->opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + ret_fp32 = prerun_graph_multithread((void*)this->graphn_fp32, this->opt); + ret_fake_quant = prerun_graph_multithread((void*)this->graphn_fake_quant, this->opt); + if (ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + /* get exec graph */ + this->exec_graph_fp32 = this->get_exec_graph(this->graphn_fp32); + this->exec_graph_fake_quant = this->get_exec_graph(this->graphn_fake_quant); + this->exec_node_num = get_vector_num(this->exec_graph_fp32->exec_node_list); + + /* ir idx <<<->>> exec idx */ + for (int i = 0; i < this->exec_node_num; i++) + { + this->node_fp32 = (struct exec_node*)get_vector_data(this->exec_graph_fp32->exec_node_list, i); + this->node_fake_quant = (struct exec_node*)get_vector_data(this->exec_graph_fake_quant->exec_node_list, i); + + int out_t = node_fp32->ir_node->output_tensors[0]; + this->ir_exec[graphn_fp32->tensor_list[out_t]->producer] = i; // ir idx --> exec idx + this->exec_ir[i] = graphn_fp32->tensor_list[out_t]->producer; // exec idx --> ir idx + // printf(" %d : %d\n", graphn_fp32->tensor_list[out_t]->producer, i); + } + + /* check for free node*/ + this->check_for_free(); + + return 0; +} + +void QuantTool::activation_requant(float* data, int elem_num, int bitcount, int symmetry, float scale, int zero_point) +{ + // symmetry = 0; + float fake_quant_max; + float 
fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + for (int i = 0; i < elem_num; i++) + { + data[i] = round(data[i] / scale) + zero_point; + data[i] = data[i] > fake_quant_max ? fake_quant_max : data[i]; + data[i] = data[i] < fake_quant_min ? fake_quant_min : data[i]; + data[i] = (data[i] - zero_point) * scale; + } +} + +void QuantTool::recursion_pass_through(struct graph* graphn, const char* layer_name, struct tensor* t, + dict_str2int& layer_used, dict_str2float& layer_scale, dict_str2float& layer_zeropoint, dict_str2int& layer_pass) +{ + if (layer_pass[t->name] == 0 && layer_used[t->name] < 2) + { + t->scale = layer_scale[layer_name]; + t->zero_point = layer_zeropoint[layer_name]; + layer_scale[t->name] = layer_scale[layer_name]; + layer_zeropoint[t->name] = layer_zeropoint[layer_name]; + + uint32_t ir_node_idx = t->producer; + struct node* t_node = graphn->node_list[ir_node_idx]; + + auto op_name = t_node->op.type; + bool poolTrue = false; + bool reluTrue = false; + if (op_name == OP_POOL) + { + struct pool_param* pool_param = (struct pool_param*)t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == OP_RELU) + { + struct relu_param* relu_param = (struct relu_param*)t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + if (op_name == OP_FLATTEN || op_name == OP_RESHAPE || op_name == OP_SQUEEZE || op_name == OP_CLIP || poolTrue || reluTrue) + { + struct tensor* t_in_tensor = graphn->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[t->name] != 0) + { + if (t_in_tensor->tensor_type == 1 || t_in_tensor->tensor_type == 3) + { + QuantTool::recursion_pass_through(graphn, t->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); + } + } + } + layer_pass[t->name] = 1; + } +} + +struct 
exec_graph* QuantTool::get_exec_graph(struct graph* graphn) +{ + struct subgraph* subgraph = get_ir_graph_subgraph(graphn, 0); + struct exec_graph* exec_graph = (struct exec_graph*)subgraph->device_graph; + + return exec_graph; +} + +void QuantTool::check_for_free() +{ + dict_uint2uint nodeA2B; + for (int i = 0; i < this->exec_node_num; i++) + { + this->node_fp32 = (struct exec_node*)get_vector_data(this->exec_graph_fp32->exec_node_list, i); + this->op_name = this->node_fp32->ir_node->op.type; + + for (int j = 0; j < this->node_fp32->ir_node->input_num; j++) + { + struct tensor* t = graphn_fp32->tensor_list[node_fp32->ir_node->input_tensors[j]]; + if (t->tensor_type == 1) + { + uint32_t ir_idx = t->producer; + nodeA2B[this->ir_exec[ir_idx]] = i; + } + } + } + + for (auto iter = nodeA2B.begin(); iter != nodeA2B.end(); iter++) + { + this->dict_free[iter->second].push_back(iter->first); + // printf(" map %d %d\n", iter->first, iter->second); + } +} + +void QuantTool::check_for_interlearve() +{ + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + /* get weight tensor */ + this->weight_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->input_tensors[1]]; + this->weight_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[1]]; + this->weight_size = this->weight_tensor_fp32->elem_num * this->weight_tensor_fp32->elem_size; + + this->weight_data_fp32 = (float*)this->weight_tensor_fp32->data; + this->weight_data_fake_quant = (float*)this->weight_tensor_fake_quant->data; + + if (this->op_name == OP_CONV) + { + this->conv_param_fp32 = (struct conv_param*)this->node_fp32->ir_node->op.param_mem; + this->conv_param_fake_quant = (struct conv_param*)this->node_fake_quant->ir_node->op.param_mem; + + if (this->conv_param_fp32->group != this->conv_param_fp32->output_channel) + { + this->conv_priv_info_fp32 = (struct conv_priv_info*)this->node_fp32->ops_priv; + this->conv_priv_info_fake_quant = (struct 
conv_priv_info*)this->node_fake_quant->ops_priv; + + this->interleave_size_fake = this->conv_priv_info_fp32->interleave_buffer_pack4_size; + + this->interleave_buffer_fp32 = (float*)this->conv_priv_info_fp32->interleave_buffer_pack4; + this->interleave_buffer_fake_quant = (float*)this->conv_priv_info_fake_quant->interleave_buffer_pack4; + } + } + else + this->interleave_size_fake = 0; + } +} + +void QuantTool::weight_bias_requant(int search) +{ + /* weight requant */ + // printf("### 1.1 this->weight_tensor_fake_quant->scale %f\n",this->weight_tensor_fake_quant->scale); + if (0 == search) + this->weight_requant(this->weight_tensor_fake_quant, this->weight_data_fake_quant, this->weight_tensor_fake_quant->elem_num, 8, 1, this->weight_tensor_fake_quant->dims[0]); + + if (this->interleave_size_fake != 0) + { + int M = this->weight_tensor_fake_quant->dims[0]; + int K = this->weight_tensor_fake_quant->elem_num / weight_tensor_fake_quant->dims[0]; + this->conv_hcl_interleave_pack4_fp32(M, K, this->weight_data_fake_quant, this->interleave_buffer_fake_quant); + } + + /* bias requant */ + if (this->node_fake_quant->ir_node->input_num > 2) + { + this->input_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[0]]; + this->bias_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[2]]; + this->bias_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->input_tensors[2]]; + this->bias_size = this->bias_tensor_fp32->elem_num * this->bias_tensor_fp32->elem_size; + this->bias_data_fp32 = (float*)this->bias_tensor_fp32->data; + this->bias_data_fake_quant = (float*)this->bias_tensor_fake_quant->data; + this->bias_requant(this->input_tensor_fake_quant, this->weight_tensor_fake_quant, this->bias_tensor_fake_quant, + this->bias_data_fake_quant, this->bias_tensor_fake_quant->elem_num, this->bias_tensor_fake_quant->dims[0]); + // this->bias_tensor_fp32->scale = 
this->bias_tensor_fake_quant->scale; + } +} + +void QuantTool::set_node_input_output_tensor(int idx, int imgi, int snum) +{ + this->out_imgs_fp32[imgi].resize(this->output_tensor_fp32->elem_num); + this->out_imgs_fake_quant[imgi].resize(this->output_tensor_fp32->elem_num); + + if (idx == 0) + { + set_tensor_buffer(this->graph_input_tensor_fp32, this->input_datas_fp32[imgi].data(), this->img_size * 4); + set_tensor_buffer(this->graph_input_tensor_fake_quant, this->input_datas_fake_quant[imgi].data(), this->img_size * 4); + } + else + { + for (int inputi = 0; inputi < this->node_fp32->ir_node->input_num; inputi++) + { + uint32_t ir_input_tensor_idx = this->node_fp32->ir_node->input_tensors[inputi]; + this->input_tensor_fp32 = this->graphn_fp32->tensor_list[ir_input_tensor_idx]; + this->input_tensor_fake_quant = this->graphn_fake_quant->tensor_list[ir_input_tensor_idx]; + + if (this->input_tensor_fp32->tensor_type == 1) + { + uint32_t ir_node_idx = this->input_tensor_fp32->producer; + uint32_t input_size = this->input_tensor_fp32->elem_num * input_tensor_fp32->elem_size; + + uint32_t exec_node_idx = this->ir_exec[ir_node_idx]; + + if (imgi == 0 && snum == 0) + { + float* buf_fp32 = (float*)sys_malloc(32); + float* buf_fake_quant = (float*)sys_malloc(32); + + set_tensor_buffer(this->input_tensor_fp32, buf_fp32, input_size); + set_tensor_buffer(this->input_tensor_fake_quant, buf_fake_quant, input_size); + + set_tensor_buffer(this->input_tensor_fp32, this->fp32_out[exec_node_idx][imgi].data(), input_size); + set_tensor_buffer(this->input_tensor_fake_quant, this->fake_quant_out[exec_node_idx][imgi].data(), input_size); + } + else + { + set_tensor_buffer(this->input_tensor_fp32, this->fp32_out[exec_node_idx][imgi].data(), input_size); + set_tensor_buffer(this->input_tensor_fake_quant, this->fake_quant_out[exec_node_idx][imgi].data(), input_size); + } + } // output tensor + } // node input number + } // node i > 0 + + /* init output buffer */ + 
set_tensor_buffer(this->output_tensor_fp32, this->out_imgs_fp32[imgi].data(), this->output_tensor_fp32->elem_num * this->output_tensor_fp32->elem_size); + set_tensor_buffer(this->output_tensor_fake_quant, this->out_imgs_fake_quant[imgi].data(), this->output_tensor_fake_quant->elem_num * this->output_tensor_fake_quant->elem_size); +} + +double QuantTool::cosin_similarity(std::vector >& in_a, std::vector >& in_b, uint32_t imgs_num, uint32_t output_num) +{ + double norm_a = 0; + double norm_b = 0; + double a_b = 0; + + uint32_t fnum = (output_num >> 4) << 4; + uint32_t rnum = output_num - fnum; + +#if 0 //__AVX__ + + float _sumaa0[8] = {0.f}; + float _sumbb0[8] = {0.f}; + float _sumaabb0[8] = {0.f}; + float _sumaa1[8] = {0.f}; + float _sumbb1[8] = {0.f}; + float _sumaabb1[8] = {0.f}; + + __m256 _suma_o0 = _mm256_set1_ps(0.0); + __m256 _sumb_o0 = _mm256_set1_ps(0.0); + __m256 _sumab_o0 = _mm256_set1_ps(0.0); + __m256 _suma_o1 = _mm256_set1_ps(0.0); + __m256 _sumb_o1 = _mm256_set1_ps(0.0); + __m256 _sumab_o1 = _mm256_set1_ps(0.0); + + for (int i = 0; i < imgs_num; i++) + { + const float* in_a_addr = in_a[i].data(); + const float* in_b_addr = in_b[i].data(); + for (int j = 0; j < fnum; j=j+32) + { + __m256 _in_a0 = _mm256_loadu_ps(in_a_addr+j); + __m256 _in_b0 = _mm256_loadu_ps(in_b_addr+j); + __m256 _in_a1 = _mm256_loadu_ps(in_a_addr+j+8); + __m256 _in_b1 = _mm256_loadu_ps(in_b_addr+j+8); + + _suma_o0 = _mm256_fmadd_ps(_in_a0, _in_a0, _suma_o0); + _sumb_o0 = _mm256_fmadd_ps(_in_b0, _in_b0, _sumb_o0); + _sumab_o0 = _mm256_fmadd_ps(_in_a0, _in_b0, _sumab_o0); + _suma_o1 = _mm256_fmadd_ps(_in_a1, _in_a1, _suma_o1); + _sumb_o1 = _mm256_fmadd_ps(_in_b1, _in_b1, _sumb_o1); + _sumab_o1 = _mm256_fmadd_ps(_in_a1, _in_b1, _sumab_o1); + } + } + _mm256_storeu_ps(_sumaa0, _suma_o0); + _mm256_storeu_ps(_sumbb0, _sumb_o0); + _mm256_storeu_ps(_sumaabb0, _sumab_o0); + _mm256_storeu_ps(_sumaa1, _suma_o1); + _mm256_storeu_ps(_sumbb1, _sumb_o1); + _mm256_storeu_ps(_sumaabb1, _sumab_o1); + 
+ for (int i = 0; i < 8; i++) + { + norm_a += _sumaa0[i] + _sumaa1[i]; + norm_b += _sumbb0[i] + _sumbb1[i]; + a_b += _sumaabb0[i] + _sumaabb1[i]; + + } + +#else // normal + // printf("AAAA DIRECT\n"); + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < fnum; j = j + 8) + { + for (int k = 0; k < 8; k = k + 1) + { + norm_a += in_a[i][j + k] * in_a[i][j + k]; + + norm_b += in_b[i][j + k] * in_b[i][j + k]; + + a_b += in_a[i][j + k] * in_b[i][j + k]; + } + } + } + +#endif // __SSE__ __AVX__ + + for (int j = fnum; j < output_num; j++) + { + for (int i = 0; i < imgs_num; i++) + { + norm_a += in_a[i][j] * in_a[i][j]; + norm_b += in_b[i][j] * in_b[i][j]; + a_b += in_a[i][j] * in_b[i][j]; + } + } + + double cosin = 0.0; + double _a_b_ = sqrt(norm_a) * sqrt(norm_b); + if (_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin = a_b; + else + cosin = a_b / _a_b_; + if (cosin < -999999 || cosin > 999999) + cosin = 0; + return cosin; +} + +double QuantTool::cosin_similarity(std::vector* in_a, std::vector* in_b, uint32_t imgs_num, uint32_t output_num) +{ + uint32_t output_channel = 1; + std::vector norm_a(output_channel, 0.0); + std::vector norm_b(output_channel, 0.0); + std::vector a_b(output_channel, 0.0); + + int elem_perchannel = int(output_num / output_channel); + + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < output_channel; j++) + { + for (int k = 0; k < elem_perchannel; k++) + { + int elem_idx = j * elem_perchannel + k; + norm_a[j] += in_a[i][elem_idx] * in_a[i][elem_idx]; + norm_b[j] += in_b[i][elem_idx] * in_b[i][elem_idx]; + a_b[j] += in_a[i][elem_idx] * in_b[i][elem_idx]; + } + } + } + + double cosin; + for (int j = 0; j < output_channel; j++) + { + double _a_b_ = sqrt(norm_a[j]) * sqrt(norm_b[j]); + if (_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin = a_b[j]; + else + cosin = a_b[j] / _a_b_; + if (cosin < -999999 || cosin > 999999) + cosin = 0; + } + return cosin; +} + +void QuantTool::weight_requant(struct tensor* weight_tensor, float* 
data, int elem_num, int bitcount, int symmetry, int elem_channel) +{ + float* scale_list = (float*)sys_malloc(elem_channel * 4); + int* zero_point_list = (int*)sys_malloc(elem_channel * 4); + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + float scale = 1; + int zero_point = 0; + for (int c = 0; c < elem_channel; c++) + { + float weight_max = *std::max_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + float weight_min = *std::min_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + if (symmetry == 1) + { + if (abs(weight_max) > abs(weight_min)) + scale = abs(weight_max) / fake_quant_max; + else + scale = abs(weight_min) / fake_quant_max; + zero_point = 0; + } + else + { + scale = (weight_max - weight_min) / fake_quant_max; + zero_point = int(-weight_min / scale); + } + + scale_list[c] = scale; + zero_point_list[c] = zero_point; + } + + if (weight_tensor->scale_list == NULL) + { + // printf(" EMPTY\n "); + weight_tensor->scale_list = scale_list; + weight_tensor->zp_list = zero_point_list; + } + else + { + scale_list = weight_tensor->scale_list; + zero_point_list = weight_tensor->zp_list; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale_list[i]) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? 
fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale_list[i]; + } + } + } +} + +void QuantTool::conv_hcl_interleave_pack4_fp32(int M, int K, float* pA, float* pA_t) +{ + int nn_outch = M >> 3; + int remain_outch_start = nn_outch << 3; + + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + const float* k0 = pA + (p + 0) * K; + const float* k1 = pA + (p + 1) * K; + const float* k2 = pA + (p + 2) * K; + const float* k3 = pA + (p + 3) * K; + const float* k4 = pA + (p + 4) * K; + const float* k5 = pA + (p + 5) * K; + const float* k6 = pA + (p + 6) * K; + const float* k7 = pA + (p + 7) * K; + + float* ktmp = pA_t + (p / 8) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp[1] = k1[0]; + ktmp[2] = k2[0]; + ktmp[3] = k3[0]; + ktmp[4] = k4[0]; + ktmp[5] = k5[0]; + ktmp[6] = k6[0]; + ktmp[7] = k7[0]; + ktmp += 8; + + k0 += 1; + k1 += 1; + k2 += 1; + k3 += 1; + k4 += 1; + k5 += 1; + k6 += 1; + k7 += 1; + } + } + + nn_outch = (M - remain_outch_start) >> 2; + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + const float* k0 = pA + (p + 0) * K; + const float* k1 = pA + (p + 1) * K; + const float* k2 = pA + (p + 2) * K; + const float* k3 = pA + (p + 3) * K; + + float* ktmp = pA_t + (p / 8 + (p % 8) / 4) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp[1] = k1[0]; + ktmp[2] = k2[0]; + ktmp[3] = k3[0]; + ktmp += 4; + + k0 += 1; + k1 += 1; + k2 += 1; + k3 += 1; + } + } + + remain_outch_start += nn_outch << 2; + + for (int p = remain_outch_start; p < M; p++) + { + const float* k0 = pA + (p + 0) * K; + + float* ktmp = pA_t + (p / 8 + (p % 8) / 4 + p % 4) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp++; + k0++; + } + } +} + +void QuantTool::gen_weight_scale(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel) +{ + float* scale_list = (float*)sys_malloc(elem_channel * 4); + int* 
zero_point_list = (int*)sys_malloc(elem_channel * 4); + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + float scale = 1; + int zero_point = 0; + for (int c = 0; c < elem_channel; c++) + { + float weight_max = *std::max_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + float weight_min = *std::min_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + if (symmetry == 1) + { + if (abs(weight_max) > abs(weight_min)) + scale = abs(weight_max) / fake_quant_max; + else + scale = abs(weight_min) / fake_quant_max; + zero_point = 0; + } + else + { + scale = (weight_max - weight_min) / fake_quant_max; + zero_point = int(-weight_min / scale); + } + + scale_list[c] = scale; + zero_point_list[c] = zero_point; + } + + weight_tensor->scale_list = scale_list; + weight_tensor->zp_list = zero_point_list; +} + +void QuantTool::bias_requant(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, + float* data, int elem_num, int elem_channel) +{ + int elem_perchannel = elem_num / elem_channel; + float* scale_list = (float*)sys_malloc(elem_channel * 4); + + for (int c = 0; c < elem_channel; c++) + { + float input_scale = input_tensor->scale; + float weight_scale = weight_tensor->scale_list[c]; + float bias_scale = input_scale * weight_scale; + scale_list[c] = bias_scale; + } + + bias_tensor->scale_list = scale_list; + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + { + data[data_idx] = 0; + } + else + { + data[data_idx] = round(data[data_idx] / scale_list[i]); + data[data_idx] = data[data_idx] * scale_list[i]; + } + } + } +} + +void 
QuantTool::weight_bias_reset() +{ + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + std::memcpy(this->weight_data_fake_quant, this->weight_data_fp32, this->weight_size); + std::memcpy(this->interleave_buffer_fake_quant, this->interleave_buffer_fp32, this->interleave_size_fake); + if (this->node_fake_quant->ir_node->input_num > 2) + { + memcpy(this->bias_data_fake_quant, this->bias_data_fp32, this->bias_size); + } + } +} + +void QuantTool::free_used_layers(int idx) +{ + // printf("#### free 0 idx %d\n",idx); + if (this->dict_free[idx].size() > 0) + { + // printf("#### free 1 idx %d\n",idx); + std::vector > freen_fp32; + std::vector > freen_fake_quant; + for (int fi = 0; fi < this->dict_free[idx].size(); fi++) + { + if (this->dict_free[idx][fi] != 0) + { + // printf("---free---\n"); + this->fp32_out[this->dict_free[idx][fi]].clear(); + this->fake_quant_out[this->dict_free[idx][fi]].clear(); + } + } + } +} + +void QuantTool::load_activation_scale(struct graph* graphn, const char* scale_file, int mode_sc) +{ + std::unordered_map layer_scale; + std::unordered_map layer_zeropoint; + bool parse_from_file = false; + if (nullptr != scale_file) + { + std::ifstream scales(scale_file); + std::string line; + while (std::getline(scales, line)) + { + std::string layer_name; + float scale_val = 0.f; + float zero_point = 0.f; + size_t last = 0; + size_t index = line.find_first_of(" ", last); + size_t idx = line.find_last_of(" ", line.size()); + layer_name = line.substr(last, index - last); + // printf("layer_name : %s \n", layer_name.c_str()); + last = index + 1; + scale_val = atof((line.substr(last, line.size() - last)).c_str()); + zero_point = atof((line.substr(idx + 1, line.size())).c_str()); + + layer_scale[layer_name] = scale_val; + layer_zeropoint[layer_name] = zero_point; + // fprintf(stderr, "quant value : %s %f %f \n", layer_name.c_str(), scale_val, zero_point); + } + } + + std::unordered_map layer_used; + for (int i = 0; i < graphn->node_num; i++) + { + 
struct node* noden = graphn->node_list[i]; + for (int j = 0; j < noden->input_num; j++) + { + std::string layern = graphn->tensor_list[noden->input_tensors[j]]->name; + layer_used[layern]++; + } + } + + if (mode_sc == 0) + { + for (int i = 0; i < graphn->tensor_num; i++) + { + struct tensor* t = graphn->tensor_list[i]; + if (t->tensor_type == 1 || t->tensor_type == 3) + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + } + } + } + else + { + std::unordered_map layer_pass; + for (int i = graphn->tensor_num - 1; i >= 0; i--) + { + struct tensor* t = graphn->tensor_list[i]; + if (t->tensor_type == 1 || t->tensor_type == 3) + { + if (layer_pass[t->name] == 0) + { + uint32_t ir_node_idx = t->producer; + struct node* t_node = graphn->node_list[ir_node_idx]; + + auto op_name = t_node->op.type; + + bool poolTrue = false; + bool reluTrue = false; + if (op_name == OP_POOL) + { + struct pool_param* pool_param = (struct pool_param*)t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == OP_RELU) + { + struct relu_param* relu_param = (struct relu_param*)t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + + if (op_name == OP_FLATTEN || op_name == OP_RESHAPE || op_name == OP_SQUEEZE || op_name == OP_CLIP || poolTrue || reluTrue) + { + struct tensor* t_in_tensor = graphn->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[t->name] != 0) + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + + if (t_in_tensor->tensor_type == 1 || t_in_tensor->tensor_type == 3) + { + this->recursion_pass_through(graphn, t->name, t_in_tensor, layer_used, layer_scale, + layer_zeropoint, layer_pass); + } + } + } + else + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + } + layer_pass[t->name] = 1; + } + } + } + } + + // for (int i = 0; i < graphn->tensor_num; i++) + // { + // struct ir_tensor* t = 
graphn->tensor_list[i]; + // if (t->tensor_type == 1 || t->tensor_type == 3) + // { + // printf(" sz %s %f %d \n",t->name, t->scale, t->zero_point); + // } + // } +} + +int QuantTool::get_exec_node_message(int exec_node_idx) +{ + /* get node */ + this->node_fp32 = (struct exec_node*)get_vector_data(this->exec_graph_fp32->exec_node_list, exec_node_idx); + this->node_fake_quant = (struct exec_node*)get_vector_data(this->exec_graph_fake_quant->exec_node_list, exec_node_idx); + + /* get op type */ + this->op_name = this->node_fp32->ir_node->op.type; + + /* get exec ops */ + this->node_ops_fp32 = this->node_fp32->node_ops; + this->node_ops_fake_quant = this->node_fake_quant->node_ops; + + /* handle the shape changed and dynamic shape case */ + if (this->node_ops_fp32->reshape && this->node_ops_fp32->reshape(this->node_ops_fp32, this->node_fp32, this->exec_graph_fp32) + && this->node_ops_fake_quant->reshape && this->node_ops_fake_quant->reshape(this->node_ops_fake_quant, this->node_fake_quant, this->exec_graph_fake_quant) < 0) + { + TLOG_ERR("failed to reshape node %d, %s\n", node_fp32->ir_node->index, node_fp32->ir_node->name); + return -1; + } + + /* get output tensor */ + this->output_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->output_tensors[0]]; + this->output_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->output_tensors[0]]; + + /* get exec ops */ + this->execidx_elemnum[exec_node_idx] = this->output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[exec_node_idx] = this->output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[exec_node_idx] = this->output_tensor_fp32->name; //exec idx --> output tensor name + + return 0; +} + +void QuantTool::cosin_similarity(std::vector& cosin, std::vector >& in_a, std::vector >& in_b, uint32_t imgs_num, uint32_t output_num, uint32_t output_channel) // cosin dis perchannel +{ + // fprintf(stderr, " 
in_a %f ",in_a[0][0]); + // fprintf(stderr, " in_b %f ",in_b[0][0]); + + std::vector norm_a(output_channel, 0.0); + std::vector norm_b(output_channel, 0.0); + std::vector a_b(output_channel, 0.0); + + int elem_perchannel = int(output_num / output_channel); + + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < output_channel; j++) + { + for (int k = 0; k < elem_perchannel; k++) + { + int elem_idx = j * elem_perchannel + k; + norm_a[j] += in_a[i][elem_idx] * in_a[i][elem_idx]; + norm_b[j] += in_b[i][elem_idx] * in_b[i][elem_idx]; + a_b[j] += in_a[i][elem_idx] * in_b[i][elem_idx]; + } + } + } + + cosin.resize(output_channel); + for (int j = 0; j < output_channel; j++) + { + double _a_b_ = sqrt(norm_a[j]) * sqrt(norm_b[j]); + // fprintf(stderr, " %lf %f %f \n ", _a_b_, sqrt(norm_a[j]), sqrt(norm_b[j]) ); + if (_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin[j] = a_b[j]; + else + cosin[j] = a_b[j] / _a_b_; + if (cosin[j] < -999999 || cosin[j] > 999999) + cosin[j] = 0; + } +} + +int QuantTool::assess_quant_loss(int gen) +{ + this->init(); + for (int i = 0; i < this->exec_node_num; i++) + { + this->get_exec_node_message(i); + this->check_for_interlearve(); + + this->out_imgs_fp32.resize(this->max_search_img_num); + this->out_imgs_fake_quant.resize(this->max_search_img_num); + if (this->op_name == OP_CONV || this->op_name == OP_FC) + this->weight_bias_requant(gen); + + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, 0); + + /* op run */ + this->node_ops_fp32->run(this->node_ops_fp32, this->node_fp32, this->exec_graph_fp32); + this->node_ops_fake_quant->run(this->node_ops_fake_quant, this->node_fake_quant, this->exec_graph_fake_quant); + this->activation_requant(this->out_imgs_fake_quant[imgi].data(), this->output_tensor_fake_quant->elem_num, 8, 1, this->output_tensor_fake_quant->scale, this->output_tensor_fake_quant->zero_point); + } + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && 
this->max_search_img_num > 1)) + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], this->weight_tensor_fp32->dims[0]); + else + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) + this->print_cosin(this->cosin.data(), i, this->weight_tensor_fp32->dims[0]); + else + this->print_cosin(this->cosin.data(), i, 1); + // fprintf(stderr, "cosin [%s] : %f\n", execidx_nodename[i].c_str(), cosin); + + this->weight_bias_reset(); + this->free_used_layers(i); + + /* save node output */ + this->fp32_out.push_back(this->out_imgs_fp32); + this->fake_quant_out.push_back(this->out_imgs_fake_quant); + } + + return 0; +} + +void QuantTool::print_cosin(double* cosin, int idx, int output_channel) +{ + float avg_cosin = 0; + float avg_num = 0; + for (int c = 0; c < output_channel; c++) + { + if (cosin[c] != 0) + { + avg_cosin += cosin[c]; + avg_num++; + } + } + fprintf(stderr, "cosin %3d %4d avg %0.6f ### ", idx, output_channel, avg_cosin / avg_num); + for (int c = 0; c < output_channel; c++) + { + fprintf(stderr, "%0.6f ", cosin[c]); + } + fprintf(stderr, "\n"); +} + +void QuantTool::weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float zoom) +{ + float* scale_list = (float*)weight_tensor->scale_list; + int* zero_point_list = (int*)weight_tensor->zp_list; + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + float scale = 
scale_list[i] * zoom; + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale; + } + } + } +} +void QuantTool::weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float* zoom) +{ + float* scale_list = (float*)weight_tensor->scale_list; + int* zero_point_list = (int*)weight_tensor->zp_list; + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + float scale = 1; + if (zoom[i] > 5) + scale = scale_list[i]; + else + scale = scale_list[i] * zoom[i]; + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? 
fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale; + } + } + } +} + +int QuantTool::quant_search() +{ + this->init(); + for (int i = 0; i < this->exec_node_num; i++) + { + this->get_exec_node_message(i); + this->check_for_interlearve(); + + this->out_imgs_fp32.resize(this->max_search_img_num); + this->out_imgs_fake_quant.resize(this->max_search_img_num); + + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + this->gen_weight_scale(this->weight_tensor_fake_quant, this->weight_data_fake_quant, this->weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0]); + this->gen_weight_scale(this->weight_tensor_fp32, this->weight_data_fp32, this->weight_tensor_fp32->elem_num, 8, 1, weight_tensor_fp32->dims[0]); + + std::vector cosin_save(weight_tensor_fake_quant->dims[0], -1); + std::vector zoom_save(weight_tensor_fake_quant->dims[0], -1); + for (int snum = 0; snum < 201; snum = snum + 20) + { + float zoom = 1.3 / 200 * (snum + 1); + // float zoom = 1.0; + /* weight requant */ + if (snum < 200) + this->weight_requant_search(weight_tensor_fake_quant, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0], zoom); + else + { + this->weight_requant_search(weight_tensor_fake_quant, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0], zoom_save.data()); + float* buf = (float*)sys_malloc(weight_tensor_fake_quant->dims[0] * 4); + memcpy(buf, zoom_save.data(), weight_tensor_fake_quant->dims[0] * 4); + // printf(" scale3 %f \n",weight_tensor_fp32->scale_list[0]); + for (int bi = 0; bi < weight_tensor_fake_quant->dims[0]; bi++) + { + buf[bi] *= weight_tensor_fp32->scale_list[bi]; + } + // printf(" scale4 %f \n",buf[0]); + // weight_tensor_fake_quant->scale_list = buf; + weight_tensor_fp32->scale_list = buf; + weight_tensor_fp32->quant_param_num = weight_tensor_fp32->dims[0]; + // printf(" scale5 %f 
\n",weight_tensor_fp32->scale_list[0]); + } + if (interleave_size_fake != 0) + { + int M = weight_tensor_fake_quant->dims[0]; + int K = weight_tensor_fake_quant->elem_num / weight_tensor_fake_quant->dims[0]; + this->conv_hcl_interleave_pack4_fp32(M, K, weight_data_fake_quant, interleave_buffer_fake_quant); + } + + /* bias requant */ + if (node_fake_quant->ir_node->input_num > 2) + { + struct tensor* input_tensor_fake_quant = graphn_fake_quant->tensor_list[node_fake_quant->ir_node->input_tensors[0]]; + struct tensor* bias_tensor_fake_quant = graphn_fake_quant->tensor_list[node_fake_quant->ir_node->input_tensors[2]]; + struct tensor* bias_tensor_fp32 = graphn_fp32->tensor_list[node_fp32->ir_node->input_tensors[2]]; + + bias_size = bias_tensor_fp32->elem_num * bias_tensor_fp32->elem_size; + + bias_data_fp32 = (float*)bias_tensor_fp32->data; + bias_data_fake_quant = (float*)bias_tensor_fake_quant->data; + + this->bias_requant(input_tensor_fake_quant, weight_tensor_fake_quant, bias_tensor_fake_quant, + bias_data_fake_quant, bias_tensor_fake_quant->elem_num, bias_tensor_fake_quant->dims[0]); + } + + /* per image run */ + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, snum); + + /* FP32 op run */ + if (snum == 0) + { + // set_tensor_buffer(output_tensor_fp32, out_imgs_fp32[imgi].data(), output_tensor_fp32->elem_num * output_tensor_fp32->elem_size); + node_ops_fp32->run(node_ops_fp32, node_fp32, exec_graph_fp32); + + this->execidx_elemnum[i] = output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[i] = output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[i] = output_tensor_fp32->name; + } + + /* fake quant op run */ + // set_tensor_buffer(output_tensor_fake_quant, out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num * output_tensor_fake_quant->elem_size); + node_ops_fake_quant->run(node_ops_fake_quant, node_fake_quant, 
exec_graph_fake_quant); + this->activation_requant(out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num, 8, 1, output_tensor_fake_quant->scale, output_tensor_fake_quant->zero_point); + } // image number + + output_channel = output_tensor_fp32->dims[1]; + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], output_channel); + else + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + + // this->cosin_similarity(this->cosin, out_imgs_fp32, out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], output_channel); + + for (int cosi = 0; cosi < output_channel; cosi++) + { + if (cosin[cosi] > cosin_save[cosi]) + { + cosin_save[cosi] = cosin[cosi]; + zoom_save[cosi] = zoom; + } + } + if (snum == 200) + { + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) + this->print_cosin(this->cosin.data(), i, output_channel); + else + this->print_cosin(this->cosin.data(), i, 1); + } + + if (op_name == OP_CONV || op_name == OP_FC) + { + memcpy(weight_data_fake_quant, weight_data_fp32, weight_size); + // this->weight_correction(weight_data_fp32, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, this->bitcount, this->symmetry, weight_tensor_fake_quant->dims[0]); + memcpy(interleave_buffer_fake_quant, interleave_buffer_fp32, interleave_size_fake); + if (node_fake_quant->ir_node->input_num > 2) + { + memcpy(bias_data_fake_quant, bias_data_fp32, bias_size); + } + } + } + } + else + { + /* per image run */ + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, 0); + + // set_tensor_buffer(output_tensor_fp32, out_imgs_fp32[imgi].data(), output_tensor_fp32->elem_num * 
output_tensor_fp32->elem_size); + node_ops_fp32->run(node_ops_fp32, node_fp32, exec_graph_fp32); + + /* fake quant op run */ + // set_tensor_buffer(output_tensor_fake_quant, out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num * output_tensor_fake_quant->elem_size); + node_ops_fake_quant->run(node_ops_fake_quant, node_fake_quant, exec_graph_fake_quant); + this->activation_requant(out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num, 8, 1, output_tensor_fake_quant->scale, output_tensor_fake_quant->zero_point); + + this->execidx_elemnum[i] = output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[i] = output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[i] = output_tensor_fp32->name; + } + this->cosin_similarity(this->cosin, out_imgs_fp32, out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + this->print_cosin(this->cosin.data(), i, 1); + this->execidx_loss[i] = cosin; + } + + this->free_used_layers(i); + + /* save node output */ + this->fp32_out.push_back(this->out_imgs_fp32); + this->fake_quant_out.push_back(this->out_imgs_fake_quant); + } // node number + // fprintf(stderr, "--------------------------------------\n"); + + if (!save_graph(graphn_fp32, "save_i8_eq.tmfile")) + { + fprintf(stderr, "save graph failed.\n"); + return -1; + } + + return 0; +} diff --git a/tools/quantize/quant_save_graph.cpp b/tools/quantize/quant_save_graph.cpp index 410db6be3..0ff72f180 100644 --- a/tools/quantize/quant_save_graph.cpp +++ b/tools/quantize/quant_save_graph.cpp @@ -505,6 +505,11 @@ int save_graph_i8_perchannel(const char* model_file, const char* scale_file, con if (internal) { // TODO + for (int ch = 0; ch < channel_num; ch++) + { + weight_scale_list[ch] = weight_tensor->scale_list[ch]; + weight_zp_list[ch] = 0; + } } else { diff --git a/tools/quantize/quant_tool.hpp b/tools/quantize/quant_tool.hpp index cc8d9ae13..99b43beba 100644 --- 
a/tools/quantize/quant_tool.hpp +++ b/tools/quantize/quant_tool.hpp @@ -22,9 +22,14 @@ * Author: hhchen@openailab.com */ +#pragma once + #include #include #include +#include +#include +#include extern "C" { #include "api/c_api.h" @@ -34,11 +39,40 @@ extern "C" { #include "graph/tensor.h" #include "utility/sys_port.h" #include "utility/utils.h" +#include "utility/log.h" +#include "utility/vector.h" + +#include "../source/device/cpu/cpu_node.h" +#include "../source/device/cpu/cpu_graph.h" + +#include "convolution_param.h" +#include "fc_param.h" +#include "pooling_param.h" +#include "relu_param.h" } +#include "quant_utils.hpp" +#include "quant_save_graph.hpp" + +typedef std::unordered_map dict_str2int; +typedef std::unordered_map dict_str2float; +typedef std::unordered_map dict_uint2uint; +typedef std::unordered_map > dict_uint2vecuint; +typedef std::unordered_map dict_uint2str; +typedef std::unordered_map > dict_uint2doublex; + #define ALGORITHM_MIN_MAX 0 #define ALGORITHM_KL 1 #define ALGORITHM_ACIQ 2 +#define ALGORITHM_DFQ 3 +#define ALGORITHM_MM_EQ 4 + +struct node_graph +{ + int pass; + std::vector input_node_list; + std::vector output_node_list; +}; class QuantTool { @@ -46,7 +80,41 @@ class QuantTool QuantTool(); ~QuantTool(); + int init(); int activation_quant_tool(); + int assess_quant_loss(int gen); + int quant_search(); + int data_free_quant(); + +private: + void recursion_pass_through(struct graph* graphn, const char* layer_name, struct tensor* t, + dict_str2int& layer_used, dict_str2float& layer_scale, + dict_str2float& layer_zeropoint, dict_str2int& layer_pass); + + struct exec_graph* get_exec_graph(struct graph* graphn); + void load_activation_scale(struct graph* graphn, const char* scale_file, int mode_sc); + int prerun_for_get_ir_tensor(void* graph, struct options opt); + void check_for_free(); + + void check_for_interlearve(); + void weight_bias_requant(int search); + void conv_hcl_interleave_pack4_fp32(int M, int K, float* pA, float* pA_t); + void 
activation_requant(float* data, int elem_num, int bitcount, int symmetry, float scale, int zero_point = 0); + void weight_requant(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel); + void weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float zoom); + void weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float* zoom); + void bias_requant(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, + float* data, int elem_num, int elem_channel); + void set_node_input_output_tensor(int idx, int imgi, int snum); + double cosin_similarity(std::vector* in_a, std::vector* in_b, uint32_t imgs_num, uint32_t output_num); + double cosin_similarity(std::vector >& in_a, std::vector >& in_b, uint32_t imgs_num, uint32_t output_num); + void cosin_similarity(std::vector& cosin, std::vector >& in_a, std::vector >& in_b, uint32_t imgs_num, uint32_t output_num, uint32_t output_channel); // cosin dis perchannel + void weight_bias_reset(); + void free_used_layers(int idx); + void gen_weight_scale(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel); + int get_exec_node_message(int exec_node_idx); + + void print_cosin(double* cosin, int idx, int output_channel); public: struct options opt; @@ -70,4 +138,72 @@ class QuantTool int focus; // flag which indicates that focus process image is necessary(maybe using for YOLOv5, 0:OFF, 1:ON, default is 0) int inplace; // process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip int algorithm_type; // the type of quant algorithm(0:min-max, 1:kl, default is 0) + bool evaluate; // evaluate quantitative losses + +private: // system variable + dict_uint2uint ir_exec; + dict_uint2uint exec_ir; + dict_uint2vecuint 
dict_free; + dict_uint2uint execidx_elemnum; + dict_uint2uint execidx_elemsize; + dict_uint2str execidx_nodename; + dict_uint2doublex execidx_loss; + + int max_search_img_num; + + std::vector cosin; + +private: // basic message + int img_size; + double cosin_max; + float scale_acc; + +private: // ir graph variable + std::vector > > fp32_out; + std::vector > > fake_quant_out; + std::vector > input_datas_fp32; + std::vector > input_datas_fake_quant; + std::vector > out_imgs_fp32; + std::vector > out_imgs_fake_quant; + + struct graph* graphn_fp32; + struct graph* graphn_fake_quant; + struct tensor* graph_input_tensor_fp32; + struct tensor* graph_input_tensor_fake_quant; + struct exec_graph* exec_graph_fp32; + struct exec_graph* exec_graph_fake_quant; + int exec_node_num; + +private: // temp variable + uint16_t op_name; + + struct exec_node* node_fp32; + struct exec_node* node_fake_quant; + struct node_ops* node_ops_fp32; + struct node_ops* node_ops_fake_quant; + + struct tensor* input_tensor_fp32; + struct tensor* input_tensor_fake_quant; + struct tensor* weight_tensor_fp32; + struct tensor* weight_tensor_fake_quant; + struct tensor* bias_tensor_fp32; + struct tensor* bias_tensor_fake_quant; + struct tensor* output_tensor_fp32; + struct tensor* output_tensor_fake_quant; + + float* weight_data_fp32; + float* weight_data_fake_quant; + uint32_t weight_size; + float* interleave_buffer_fp32; + float* interleave_buffer_fake_quant; + uint32_t interleave_size_fake; + float* bias_data_fp32; + float* bias_data_fake_quant; + uint32_t bias_size; + uint32_t output_channel; + + struct conv_priv_info* conv_priv_info_fp32; + struct conv_priv_info* conv_priv_info_fake_quant; + struct conv_param* conv_param_fp32; + struct conv_param* conv_param_fake_quant; }; diff --git a/tools/quantize/quant_tool_int8.cpp b/tools/quantize/quant_tool_int8.cpp index 4e318d9e4..3c63ebf0b 100644 --- a/tools/quantize/quant_tool_int8.cpp +++ b/tools/quantize/quant_tool_int8.cpp @@ -66,6 +66,7 @@ 
QuantTool::QuantTool() this->focus = 0; this->inplace = true; this->algorithm_type = ALGORITHM_MIN_MAX; + this->evaluate = false; } QuantTool::~QuantTool() @@ -163,6 +164,7 @@ int QuantTool::activation_quant_tool() /* init minmax */ std::unordered_map max_activation; std::unordered_map min_activation; + std::unordered_map act_map; uint32_t act_tensor_num = 0; for (int i = 0; i < ir_graph->tensor_num; i++) { @@ -172,6 +174,7 @@ int QuantTool::activation_quant_tool() act_tensor_num++; max_activation[i] = -FLT_MAX; min_activation[i] = FLT_MAX; + act_map[act_tensor_num - 1] = i; } } @@ -213,10 +216,134 @@ int QuantTool::activation_quant_tool() } } } + fprintf(stderr, "\n"); if (this->algorithm_type == ALGORITHM_KL) { - /* todo support */ - fprintf(stderr, "\r\n[****WARNING****]:Step 2 find original calibration kl threshold table NOT support temporarily!\n"); + /* kl process divergence */ + fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table.\n"); + std::unordered_map tensor_hist; + std::unordered_map hist_tensor; + std::vector > hist_edge; + std::vector > hist_gram; + + /* second loop, create histgram */ + for (int nums = imgs_list.size() - 1; nums >= 0; nums--) + { + fprintf(stderr, "\r[Quant Tools Info]: Step 2, images %.5d / %.5d", nums + 1, img_num); + + get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); + + /* run graph */ + if (run_graph(ir_graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + + /* calculate hist */ + uint32_t inum = 0; + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + float step_max = std::abs(max_activation[i]); + if (std::abs(min_activation[i]) > step_max) + step_max = std::abs(min_activation[i]); + float step_bin = step_max / 2048.0f; + + 
std::vector every_edge; + if (nums == imgs_list.size() - 1) + { + for (int j = 0; j < 2048; j++) + { + float edge_float = (step_bin * (j + 0.5f)); + every_edge.push_back(edge_float); + } + hist_edge.push_back(every_edge); + hist_gram.push_back(histCount((float*)ir_tensor->data, ir_tensor->elem_num, step_max)); + } + else + { + std::vector hist_tmp; + hist_tmp = histCount((float*)ir_tensor->data, ir_tensor->elem_num, step_max); + for (int j = 0; j < 2048; j++) + { + hist_gram[inum][j] += hist_tmp[j]; + } + } + + tensor_hist[i] = inum; + hist_tensor[inum] = i; + inum++; + } + } + } + + fprintf(stderr, "\n"); + + /* save the calibration file with min-max algorithm with kl divergence */ + int fake_quant_set = 127; + FILE* fp_kl = fopen("table_kl.scale", "wb"); + for (int i = 0; i < act_tensor_num; i++) + { + struct tensor* t = ir_graph->tensor_list[act_map[i]]; + int threshold_bin = threshold_distribution(hist_gram[i], fake_quant_set + 1); + fprintf(stderr, " threshold_bin %d \n", threshold_bin); + + float act_scale = hist_edge[i][threshold_bin] / fake_quant_set; + int act_zero_point = 0; + + /* the scale of softmax always is scale = 1 / 127.f */ + for (int j = 0; j < ir_graph->node_num; j++) + { + struct node* noden = ir_graph->node_list[j]; + struct tensor* tensor_tmp = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); + + if (!(tensor_tmp->tensor_type == TENSOR_TYPE_INPUT || tensor_tmp->tensor_type == TENSOR_TYPE_VAR)) + continue; + + std::string tmp_op_name = get_op_name_from_type(noden->op.type); + std::string cur_name = t->name; + std::string tmp_name = tensor_tmp->name; + + if ((cur_name == tmp_name) && tmp_op_name == "Softmax") + { + act_scale = 1 / 127.f; + act_zero_point = 0; + break; + } + } + + /* the scale of eltwise */ + for (int j = 0; j < ir_graph->node_num; j++) + { + struct node* noden = ir_graph->node_list[j]; + std::string tmp_op_name = get_op_name_from_type(noden->op.type); + if (tmp_op_name == "Eltwise") + { + struct tensor* tensor_in0 = 
get_ir_graph_tensor(ir_graph, noden->input_tensors[0]); + struct tensor* tensor_in1 = get_ir_graph_tensor(ir_graph, noden->input_tensors[1]); + struct tensor* tensor_out = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); + + std::string cur_name = t->name; + std::string tmp_name0 = tensor_in0->name; + std::string tmp_name1 = tensor_in1->name; + + if ((cur_name == tmp_name0 || cur_name == tmp_name1)) + { + act_scale = tensor_out->scale; + break; + } + } + } + + t->scale = act_scale; + t->zero_point = 0; + fprintf(fp_kl, "%s %f %d\n", t->name, act_scale, act_zero_point); + } + fclose(fp_kl); + fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table done, output ./table_kl.scale\n"); } else if (this->algorithm_type == ALGORITHM_ACIQ) { @@ -304,7 +431,7 @@ int QuantTool::activation_quant_tool() fprintf(stderr, "\r\n[Quant Tools Info]: Step 2, find original calibration minmax threshold table done, output ./table_minmax.scale\n"); } - fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num); + // fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num); /* release tengine */ postrun_graph(ir_graph); @@ -343,7 +470,7 @@ int main(int argc, char* argv[]) QuantTool quant_tool; int res; - while ((res = getopt(argc, argv, "m:a:f:o:i:g:s:w:b:c:y:k:t:h")) != -1) + while ((res = getopt(argc, argv, "m:a:f:o:i:g:s:w:b:c:y:k:z:t:h")) != -1) { switch (res) { @@ -390,6 +517,9 @@ int main(int argc, char* argv[]) case 'k': quant_tool.focus = atoi(optarg); break; + case 'z': + quant_tool.evaluate = atoi(optarg); + break; case 't': quant_tool.num_thread = atoi(optarg); quant_tool.opt.num_thread = atoi(optarg); @@ -444,35 +574,100 @@ int main(int argc, char* argv[]) fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus ? 
"ON" : "OFF"); fprintf(stderr, "Thread num : %d\n\n", quant_tool.num_thread); - /* using 3rd calibration table file */ - if (quant_tool.scale_file.empty()) + switch (quant_tool.algorithm_type) + { + case ALGORITHM_MIN_MAX: { - /* select algorithm */ - if (quant_tool.algorithm_type == ALGORITHM_MIN_MAX) + if (quant_tool.scale_file.empty()) { quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); } - else if (quant_tool.algorithm_type == ALGORITHM_KL) + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; + } + case ALGORITHM_KL: + { + if (quant_tool.scale_file.empty()) { quant_tool.scale_file = "table_kl.scale"; + quant_tool.activation_quant_tool(); } - else if (quant_tool.algorithm_type == ALGORITHM_ACIQ) + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; + } + case ALGORITHM_ACIQ: + { + if (quant_tool.scale_file.empty()) { quant_tool.scale_file = "table_aciq.scale"; + quant_tool.activation_quant_tool(); } - else + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; + } + case ALGORITHM_DFQ: + { + quant_tool.data_free_quant(); + quant_tool.model_file = "test_dfq_fp32.tmfile"; + 
if (quant_tool.scale_file.empty()) { - fprintf(stderr, "[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n"); quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); } - - /* quantize activation */ - quant_tool.activation_quant_tool(); + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; + } + case ALGORITHM_MM_EQ: + { + if (quant_tool.scale_file.empty()) + { + quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); + } + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + /* Enable EQ search */ + fprintf(stderr, "[Quant Tools Info]: Step Search, enable EQ search\n"); + quant_tool.quant_search(); + quant_tool.model_file = "save_i8_eq.tmfile"; + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, true); + break; + } + default: + { + fprintf(stderr, "Unsupported quantization type ... 
\n"); + break; + } } - - /* quantize weight/bias and save into int8 tmfile */ - fprintf(stderr, "[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str()); - save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\\(^0^)/ ----\n"); diff --git a/tools/quantize/quant_utils.cpp b/tools/quantize/quant_utils.cpp index 0f60d3838..f938f3617 100644 --- a/tools/quantize/quant_utils.cpp +++ b/tools/quantize/quant_utils.cpp @@ -77,7 +77,7 @@ void split(float* array, char* str, const char* del) } void get_input_data_cv(const char* image_file, float* input_data, int img_c, int img_h, int img_w, const float* mean, - const float* scale, int sw_RGB = 0, int center_crop = 0, int letterbox_rows = 0, int letterbox_cols = 0, int focus = 0) + const float* scale, int sw_RGB = 1, int center_crop = 0, int letterbox_rows = 0, int letterbox_cols = 0, int focus = 0) { /* only for yolov5s */ if (focus == 1 && letterbox_rows > 0 && letterbox_cols > 0) @@ -411,6 +411,22 @@ std::vector histCount(float* data, uint32_t elem_num, float max_val, f return hist; } +std::vector histCount(float* data, uint32_t elem_num, float abs_max) +{ + float bin_scale = abs_max / 2047.f; + int bin_zp = 0; + std::vector hist(2048); + for (int i = 0; i < elem_num; i++) + { + if (data[i] != 0) + { + uint32_t hist_idx = round(std::abs(data[i]) / bin_scale); + hist[hist_idx]++; + } + } + return hist; +} + float compute_kl_divergence(std::vector& dist_a, std::vector& dist_b) { const size_t length = dist_a.size(); diff --git a/tools/quantize/quant_utils.hpp b/tools/quantize/quant_utils.hpp index 4ad636763..df529181a 100644 --- a/tools/quantize/quant_utils.hpp +++ b/tools/quantize/quant_utils.hpp @@ -40,6 +40,7 @@ void get_input_data_cv(const char* image_file, float* input_data, int img_c, int void 
readFileList(std::string basePath, std::vector<std::string>& imgs); std::vector<uint32_t> histCount(float* data, uint32_t elem_num, float max_val, float min_val); +std::vector<uint32_t> histCount(float* data, uint32_t elem_num, float abs_max); float compute_kl_divergence(std::vector<float>& dist_a, std::vector<float>& dist_b);