diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index ba8d3d961..b8504e500 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -30,6 +30,8 @@ IF (${TENGINE_TARGET_PROCESSOR} MATCHES "X86") ADD_EXECUTABLE( ${name} ./quant_save_graph.cpp + ./algorithm/quant_dfq.cpp + ./algorithm/quant_eq.cpp ./quant_utils.cpp ../save_graph/save_graph.cpp ../save_graph/tm2_op_save.cpp diff --git a/tools/quantize/README.md b/tools/quantize/README.md index e0e1d7547..968dc1441 100644 --- a/tools/quantize/README.md +++ b/tools/quantize/README.md @@ -76,7 +76,7 @@ Status : int8, per-channel, symmetric Before use the quant tool, **you need Float32 tmfile and Calibration Dataset**, the image num of calibration dataset we suggest to use 500-1000. ``` -$ .quant_tool_int8 -m ./mobilenet_fp32.tmfile -i ./dataset -o ./mobilenet_int8.tmfile -g 3,224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017 +$ ./quant_tool_int8 -m ./mobilenet_fp32.tmfile -i ./dataset -o ./mobilenet_int8.tmfile -g 3,224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017 -z 1 ---- Tengine Post Training Quantization Tool ---- @@ -111,6 +111,38 @@ Thread num : 1 [Quant Tools Info]: Step 4, quantize activation tensor done. [Quant Tools Info]: Step 5, quantize weight tensor done. [Quant Tools Info]: Step 6, save Int8 tmfile done, ./mobilenet_int8.tmfile +[Quant Tools Info]: Step Evaluate, evaluate quantitative losses +cosin 0 32 avg 0.995317 ### 0.000000 0.953895 0.998249 0.969256 ... +cosin 1 32 avg 0.982403 ### 0.000000 0.902383 0.964436 0.873998 ... +cosin 2 64 avg 0.976753 ### 0.952854 0.932301 0.982766 0.958503 ... +cosin 3 64 avg 0.981889 ### 0.976637 0.981754 0.987276 0.970671 ... +cosin 4 128 avg 0.979728 ### 0.993999 0.991858 0.990438 0.992766 ... +cosin 5 128 avg 0.970351 ### 0.772556 0.989541 0.986996 0.989563 ... +cosin 6 128 avg 0.954545 ### 0.950125 0.922964 0.946804 0.972852 ... +cosin 7 128 avg 0.977192 ### 0.994728 0.972071 0.995353 0.992700 ... 
+cosin 8 256 avg 0.977426 ### 0.968429 0.991248 0.991274 0.994450 ... +cosin 9 256 avg 0.962224 ### 0.985255 0.969171 0.958762 0.967461 ... +cosin 10 256 avg 0.954253 ### 0.984353 0.935643 0.656188 0.929778 ... +cosin 11 256 avg 0.971987 ### 0.997596 0.967681 0.476525 0.999115 ... +cosin 12 512 avg 0.972861 ### 0.968920 0.905907 0.993918 0.622953 ... +cosin 13 512 avg 0.959161 ### 0.935686 0.000000 0.642560 0.994388 ... +cosin 14 512 avg 0.963903 ### 0.979613 0.957169 0.976440 0.902512 ... +cosin 15 512 avg 0.963226 ### 0.977065 0.965819 0.998149 0.905297 ... +cosin 16 512 avg 0.960935 ### 0.861674 0.972926 0.950579 0.987609 ... +cosin 17 512 avg 0.961057 ### 0.738472 0.987884 0.999124 0.995397 ... +cosin 18 512 avg 0.960127 ### 0.935455 0.968909 0.970831 0.981240 ... +cosin 19 512 avg 0.963755 ### 0.972628 0.992305 0.999518 0.799737 ... +cosin 20 512 avg 0.949364 ### 0.922776 0.896038 0.945079 0.971338 ... +cosin 21 512 avg 0.961256 ### 0.902256 0.896438 0.923361 0.973974 ... +cosin 22 512 avg 0.946552 ### 0.963806 0.982075 0.878965 0.929992 ... +cosin 23 512 avg 0.953677 ### 0.953880 0.996364 0.936540 0.930796 ... +cosin 24 1024 avg 0.941197 ### 0.000000 0.992507 1.000000 0.994460 ... +cosin 25 1024 avg 0.973546 ### 1.000000 0.889181 0.000000 0.998084 ... +cosin 26 1024 avg 0.869351 ### 0.522966 0.000000 0.987009 0.000000 ... +cosin 27 1 avg 0.974982 ### 0.974982 +cosin 28 1 avg 0.974982 ### 0.974982 +cosin 29 1 avg 0.974982 ### 0.974982 +cosin 30 1 avg 0.978486 ### 0.978486 ---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\(^0^)/ ---- ``` diff --git a/tools/quantize/algorithm/quant_dfq.cpp b/tools/quantize/algorithm/quant_dfq.cpp new file mode 100644 index 000000000..7b17ddbbf --- /dev/null +++ b/tools/quantize/algorithm/quant_dfq.cpp @@ -0,0 +1,572 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "../quant_tool.hpp" + +//int QuantTool::data_free_quant(const char* model_file, const char* image_dir, +// int img_c, int img_h, int img_w, const float* mean, const float* scale, +// int num_thread, int sw_RGB, int center_crop) +int QuantTool::data_free_quant() +{ + int letterbox = 0; + int loop_count = 1; + const char* image_file = nullptr; + + /* set runtime options */ + struct options opt; + opt.num_thread = num_thread; + opt.cluster = TENGINE_CLUSTER_ALL; + opt.precision = TENGINE_MODE_FP32; + + // /* inital tengine */ + // if (init_tengine() != 0) + // { + // fprintf(stderr, "Initial tengine failed.\n"); + // return -1; + // } + // fprintf(stderr, "tengine-lite library version: %s\n", get_tengine_version()); + + /* create graph, load tengine model xxx.tmfile */ + graph_t graph = create_graph(NULL, "tengine", model_file.c_str()); + if (NULL == graph) + { + fprintf(stderr, "Create graph failed.\n"); + fprintf(stderr, "errno: %d \n", get_tengine_errno()); + return -1; + } + + struct graph* graphn = (struct graph*)graph; + struct node_graph* node_proto = (struct node_graph*)sys_malloc(sizeof(struct node_graph) * graphn->node_num); + + for (int i = 0; i < graphn->node_num; i++) + { + 
struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_type = n->op.type; + const char* layer_name = n->name; //layer name + + const uint16_t input_num = n->input_num; //input num + const uint16_t output_num = n->output_num; //output num + + node_proto[i].pass = 0; + // node_proto[i].input_node_list = create_vector(sizeof(uint16_t), NULL); + // node_proto[i].output_node_list = create_vector(sizeof(uint16_t), NULL); + + for (int j = 0; j < input_num; j++) + { + struct tensor* input_tensor = get_ir_graph_tensor(graphn, n->input_tensors[j]); + const char* input_tensor_name = input_tensor->name; + uint8_t dim_num = input_tensor->dim_num; + + if (input_tensor->producer >= 0) + { + struct node* node = graphn->node_list[input_tensor->producer]; + node_proto[i].input_node_list.push_back(node->index); + node_proto[node->index].output_node_list.push_back(i); + } + if (OP_CONV == op_type || OP_FC == op_type) + { + break; + } + } + } + + for (int i = 0; i < graphn->node_num; i++) + { + struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_type = n->op.type; + const char* layer_name = n->name; //layer name + if (op_type != NULL) + { + if (OP_CONV != op_type && OP_FC != op_type) + { + if (node_proto[i].input_node_list.size() == 1 && node_proto[i].output_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + uint16_t node_output_id = node_proto[i].output_node_list[0]; + if (node_proto[node_input_id].output_node_list.size() == 1 && node_proto[node_output_id].input_node_list.size() == 1) + { + node_proto[i].input_node_list.erase(node_proto[i].input_node_list.begin() + 0); + node_proto[i].output_node_list.erase(node_proto[i].output_node_list.begin() + 0); + + node_proto[node_input_id].output_node_list.erase(node_proto[node_input_id].output_node_list.begin() + 0); + node_proto[node_input_id].output_node_list.push_back(node_output_id); + + 
node_proto[node_output_id].input_node_list.erase(node_proto[node_output_id].input_node_list.begin() + 0); + node_proto[node_output_id].input_node_list.push_back(node_input_id); + } + } + } + } + } + + for (int i = 0; i < graphn->node_num; i++) + { + struct node* n = graphn->node_list[i]; //ir node + const uint16_t node_idx = n->index; //node idx + auto op_name = n->op.type; + const char* layer_name = n->name; //layer name + + const uint16_t input_num = n->input_num; //input num + const uint16_t output_num = n->output_num; //output num + + if (op_name != NULL) + { + if (OP_CONV == op_name) + { + // DW_Conv && Direct_Conv + struct conv_param* conv_param = (struct conv_param*)n->op.param_mem; + if (conv_param->group == conv_param->output_channel) + { + // printf(" #### DW Conv ####\n"); + if (node_proto[i].input_node_list.size() == 1 && node_proto[i].output_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + uint16_t node_output_id = node_proto[i].output_node_list[0]; + auto op_name0 = graphn->node_list[node_input_id]->op.type; + auto op_name2 = graphn->node_list[node_output_id]->op.type; + + if (node_proto[node_input_id].output_node_list.size() == 1 && node_proto[node_output_id].input_node_list.size() == 1 && OP_CONV == op_name0 && OP_CONV == op_name2) + { + node_proto[i].pass = 1; //layer1 + node_proto[node_input_id].pass = 1; //layer0 + node_proto[node_output_id].pass = 1; //layer2 + + // layer0 min/max range + struct node* nodeP = graphn->node_list[node_input_id]; + struct tensor* input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + uint16_t dims0 = input_tensor->dims[0]; + uint16_t dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer0_max(dims0, 0.0f); + std::vector layer0_min(dims0, 0.0f); + std::vector layer0_range(dims0, 0.0f); + + float* data_layer0 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; 
d1++) + { + if (data_layer0[dims123 * d0 + d1] > layer0_max[d0]) + layer0_max[d0] = data_layer0[dims123 * d0 + d1]; + if (data_layer0[dims123 * d0 + d1] < layer0_min[d0]) + layer0_min[d0] = data_layer0[dims123 * d0 + d1]; + } + } + // printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer0_range[d0] = layer0_max[d0] - layer0_min[d0]; + } + + // layer1 min/max range + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer1_max(dims0, 0.0f); + std::vector layer1_min(dims0, 0.0f); + std::vector layer1_range(dims0, 0.0f); + + float* data_layer1 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + if (data_layer1[dims123 * d0 + d1] > layer1_max[d0]) + layer1_max[d0] = data_layer1[dims123 * d0 + d1]; + if (data_layer1[dims123 * d0 + d1] < layer1_min[d0]) + layer1_min[d0] = data_layer1[dims123 * d0 + d1]; + } + } + // printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer1_range[d0] = layer1_max[d0] - layer1_min[d0]; + } + + // layer2 min/max range + nodeP = graphn->node_list[node_output_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + uint16_t dims1 = input_tensor->dims[1]; + uint16_t dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer2_max(dims0, 0.0f); + std::vector layer2_min(dims0, 0.0f); + std::vector layer2_range(dims0, 0.0f); + + float* data_layer2 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + if (data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] > layer2_max[d1]) + { + layer2_max[d1] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + if (data_layer2[dims1 * dims23 * 
d0 + dims23 * d1 + d2] < layer2_min[d1]) + { + layer2_min[d1] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + } + } + } + // printf("### %d ###\n",dims1); + for (int d1 = 0; d1 < dims1; d1++) + { + layer2_range[d1] = layer2_max[d1] - layer2_min[d1]; + } + + ////////////////////////////////////////////////////////////////////////////////// + + // layer ops sqrt + float ops_range[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + ops_range[ops] = pow(layer0_range[ops] * layer1_range[ops] * layer2_range[ops], 1.0 / 3); + } + + float S01[dims1]; + float S01_F[dims1]; + float S12[dims1]; + float S12_F[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + if (ops_range[ops] == 0) + { + S01[ops] = 0.0; + S12_F[ops] = 0.0; + } + else + { + S01[ops] = layer0_range[ops] / ops_range[ops]; + S12_F[ops] = layer2_range[ops] / ops_range[ops]; + } + if (layer0_range[ops] == 0) + S01_F[ops] = 0.0; + else + S01_F[ops] = ops_range[ops] / layer0_range[ops]; + if (layer2_range[ops] == 0) + S12[ops] = 0.0; + else + S12[ops] = ops_range[ops] / layer2_range[ops]; + } + ////////////////////////////////////////////////////////////////////////////////// + + // layer0 output + nodeP = graphn->node_list[node_input_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer0[dims123 * d0 + d1] = data_layer0[dims123 * d0 + d1] * S01_F[d0]; + } + } + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + dims0 = input_tensor->dims[0]; + float* data_layer0_bias = (float*)sys_malloc(sizeof(float) * dims0); + data_layer0_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer0_bias[d0] = data_layer0_bias[d0] * S01_F[d0]; + } + + // layer1 output + nodeP = graphn->node_list[i]; + input_tensor = 
get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer1[dims123 * d0 + d1] = data_layer1[dims123 * d0 + d1] * S01[d0] * S12_F[d0]; + } + } + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + dims0 = input_tensor->dims[0]; + float* data_layer1_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer1_bias[d0] = data_layer1_bias[d0] * S12_F[d0]; + } + + // layer2 output + nodeP = graphn->node_list[node_output_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims1 = input_tensor->dims[1]; + dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] = data_layer2[dims1 * dims23 * d0 + dims23 * d1 + d2] * S12[d1]; + } + } + } + } + } + } + else + { + // printf(" #### Direct Conv ####\n"); + if (node_proto[i].pass == 0) + { + if (node_proto[i].input_node_list.size() == 1) + { + uint16_t node_input_id = node_proto[i].input_node_list[0]; + if (graphn->node_list[node_input_id]->input_num > 0) + { + auto op_name0 = graphn->node_list[node_input_id]->op.type; + + if (node_proto[node_input_id].output_node_list.size() == 1 && op_name0 == OP_CONV) + { + node_proto[i].pass = 1; //layer1 + node_proto[node_input_id].pass = 1; //layer0 + + // layer0 min/max range + struct node* nodeP = graphn->node_list[node_input_id]; + struct tensor* input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + uint16_t dims0 = input_tensor->dims[0]; + uint16_t dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer0_max(dims0, 0.0f); + std::vector 
layer0_min(dims0, 0.0f); + std::vector layer0_range(dims0, 0.0f); + + float* data_layer0 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + if (data_layer0[dims123 * d0 + d1] > layer0_max[d0]) + layer0_max[d0] = data_layer0[dims123 * d0 + d1]; + if (data_layer0[dims123 * d0 + d1] < layer0_min[d0]) + layer0_min[d0] = data_layer0[dims123 * d0 + d1]; + } + } + // printf("### %d ###\n",dims0); + for (int d0 = 0; d0 < dims0; d0++) + { + layer0_range[d0] = layer0_max[d0] - layer0_min[d0]; + } + + // layer1 min/max range + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + uint16_t dims1 = input_tensor->dims[1]; + uint16_t dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + + std::vector layer1_max(dims0, 0.0f); + std::vector layer1_min(dims0, 0.0f); + std::vector layer1_range(dims0, 0.0f); + + float* data_layer1 = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + if (data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] > layer1_max[d1]) + { + layer1_max[d1] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + if (data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] < layer1_min[d1]) + { + layer1_min[d1] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2]; + } + } + } + } + // printf("### %d ###\n",dims1); + for (int d0 = 0; d0 < dims1; d0++) + { + layer1_range[d0] = layer1_max[d0] - layer1_min[d0]; + } + + ////////////////////////////////////////////////////////////////////////////////// + + // layer ops sqrt + float ops_range[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + ops_range[ops] = sqrt(layer0_range[ops] * layer1_range[ops]); + } + + float S01[dims1]; + float S01_F[dims1]; + for (int ops = 0; ops < dims1; ops++) + { + if (ops_range[ops] == 0) + { + S01[ops] = 0.0; + } + else + { + S01[ops] = 
layer0_range[ops] / ops_range[ops]; + } + if (layer0_range[ops] == 0) + S01_F[ops] = 0.0; + else + S01_F[ops] = ops_range[ops] / layer0_range[ops]; + } + ////////////////////////////////////////////////////////////////////////////////// + // layer0 output + nodeP = graphn->node_list[node_input_id]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims123 = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims123; d1++) + { + data_layer0[dims123 * d0 + d1] = data_layer0[dims123 * d0 + d1] * S01_F[d0]; + } + } + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[2]); + dims0 = input_tensor->dims[0]; + float* data_layer0_bias = (float*)sys_malloc(sizeof(float) * dims0); + data_layer0_bias = (float*)input_tensor->data; + for (int d0 = 0; d0 < dims0; d0++) + { + data_layer0_bias[d0] = data_layer0_bias[d0] * S01_F[d0]; + } + + // layer1 output + nodeP = graphn->node_list[i]; + input_tensor = get_ir_graph_tensor(graphn, nodeP->input_tensors[1]); + dims0 = input_tensor->dims[0]; + dims1 = input_tensor->dims[1]; + dims23 = input_tensor->dims[2] * input_tensor->dims[3]; + for (int d0 = 0; d0 < dims0; d0++) + { + for (int d1 = 0; d1 < dims1; d1++) + { + for (int d2 = 0; d2 < dims23; d2++) + { + data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] = data_layer1[dims1 * dims23 * d0 + dims23 * d1 + d2] * S01[d1]; + } + } + } + } + } + } + } + } + } + } + } + + if (!save_graph(graph, "test_dfq_fp32.tmfile")) + { + fprintf(stderr, "save graph failed.\n"); + return -1; + } + + /* set the shape, data buffer of input_tensor of the graph */ + int img_size = img_h * img_w * img_c; + int dims[] = {1, img_c, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if (input_tensor == NULL) + { + fprintf(stderr, "Get input tensor 
failed\n"); + return -1; + } + + if (set_tensor_shape(input_tensor, dims, 4) < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + return -1; + } + + if (set_tensor_buffer(input_tensor, input_data, img_size * 4) < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread(graph, opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + std::vector imgs_list; + if (image_dir.c_str() != NULL) + { + readFileList(image_dir, imgs_list); + } + else + { + imgs_list.push_back(image_file); + } + uint32_t img_num = imgs_list.size(); + + /* prepare process input data, set the data mem to input tensor */ + get_input_data_cv(imgs_list[0].c_str(), input_data, img_c, img_h, img_w, mean, scale, + 1, 0, 0, 0, 0); + + /* run graph */ + for (int i = 0; i < loop_count; i++) + { + double start = get_current_time(); + if (run_graph(graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + } + + /* get the result of classification */ + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + float* output_data = (float*)get_tensor_buffer(output_tensor); + int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float); + + // printf("out put data %f %d \n",output_data[0], output_size); + fprintf(stderr, "--------------------------------------\n"); + + /* release tengine */ + free(input_data); + postrun_graph(graph); + destroy_graph(graph); + // release_tengine(); + + return 0; +} diff --git a/tools/quantize/algorithm/quant_eq.cpp b/tools/quantize/algorithm/quant_eq.cpp new file mode 100644 index 000000000..b8e80fe6b --- /dev/null +++ b/tools/quantize/algorithm/quant_eq.cpp @@ -0,0 +1,1283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "../quant_tool.hpp" + +int QuantTool::init() +{ + // ir graph variable + this->fp32_out.clear(); + this->fake_quant_out.clear(); + + /* load fp32 graph and fake quant graph */ + this->graphn_fp32 = (struct graph*)create_graph(nullptr, "tengine", this->model_file.c_str()); + this->graphn_fake_quant = (struct graph*)create_graph(nullptr, "tengine", this->model_file.c_str()); + + if (this->graphn_fp32 == nullptr || this->graphn_fake_quant == nullptr) + { + fprintf(stderr, "Create graph failed.\n"); + fprintf(stderr, "errno: %d \n", get_tengine_errno()); + return -1; + } + + /* load activation scale to ir_tensor */ + this->load_activation_scale(this->graphn_fp32, this->scale_file.c_str(), this->inplace); + this->load_activation_scale(this->graphn_fake_quant, this->scale_file.c_str(), this->inplace); + + /* get graph input tensor */ + this->graph_input_tensor_fp32 = (struct tensor*)get_graph_input_tensor((void*)this->graphn_fp32, 0, 0); + this->graph_input_tensor_fake_quant = (struct tensor*)get_graph_input_tensor((void*)this->graphn_fake_quant, 0, 0); + if (this->graph_input_tensor_fp32 == nullptr || this->graph_input_tensor_fake_quant == nullptr) + { + fprintf(stderr, "Get input tensor 
failed\n"); + return -1; + } + + /* generate images list */ + std::vector imgs_list; + if (!this->image_dir.empty()) + readFileList(this->image_dir, imgs_list); + uint32_t img_num = imgs_list.size(); + + this->max_search_img_num = 50; + if (img_num < this->max_search_img_num) + this->max_search_img_num = img_num; + + // fprintf(stderr, "# eq dataset num %d\n", this->max_search_img_num); + + /* set the shape, data buffer of input_tensor of the graph */ + this->img_size = this->img_h * this->img_w * this->img_c; + int dims[] = {1, img_c, img_h, img_w}; // nchw + float* input_data_fp32 = (float*)malloc(this->img_size * sizeof(float)); + float* input_data_fake_quant = (float*)malloc(this->img_size * sizeof(float)); + + /* prepare process input data, set the data mem to input tensor */ + float scale_graph_input = this->graph_input_tensor_fake_quant->scale; + int zero_point_graph_input = this->graph_input_tensor_fake_quant->zero_point; + // fprintf(stderr, "scale zp %f %d\n", scale_graph_input, zero_point_graph_input); + + this->input_datas_fp32.resize(this->max_search_img_num); + this->input_datas_fake_quant.resize(this->max_search_img_num); + + for (int i = 0; i < this->max_search_img_num; i++) + { + this->input_datas_fp32[i].resize(this->img_size); + this->input_datas_fake_quant[i].resize(this->img_size); + + get_input_data_cv(imgs_list[i].c_str(), this->input_datas_fp32[i].data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); + + this->input_datas_fake_quant[i] = this->input_datas_fp32[i]; + this->activation_requant(this->input_datas_fake_quant[i].data(), this->img_size, 8, 1, scale_graph_input, + zero_point_graph_input); + } + + /* set graph input shape */ + int ret_fp32 = set_tensor_shape(this->graph_input_tensor_fp32, dims, 4); + int ret_fake_quant = set_tensor_shape(this->graph_input_tensor_fake_quant, dims, 4); + if (ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + 
return -1; + } + + /* set graph input buffer */ + ret_fp32 = set_tensor_buffer(this->graph_input_tensor_fp32, input_data_fp32, this->img_size * 4); + ret_fake_quant = set_tensor_buffer(this->graph_input_tensor_fake_quant, input_data_fake_quant, this->img_size * 4); + if (ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread((void*)this->graphn_fp32, this->opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + ret_fp32 = prerun_graph_multithread((void*)this->graphn_fp32, this->opt); + ret_fake_quant = prerun_graph_multithread((void*)this->graphn_fake_quant, this->opt); + if (ret_fp32 < 0 || ret_fake_quant < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + /* get exec graph */ + this->exec_graph_fp32 = this->get_exec_graph(this->graphn_fp32); + this->exec_graph_fake_quant = this->get_exec_graph(this->graphn_fake_quant); + this->exec_node_num = get_vector_num(this->exec_graph_fp32->exec_node_list); + + /* ir idx <<<->>> exec idx */ + for (int i = 0; i < this->exec_node_num; i++) + { + this->node_fp32 = (struct exec_node*)get_vector_data(this->exec_graph_fp32->exec_node_list, i); + this->node_fake_quant = (struct exec_node*)get_vector_data(this->exec_graph_fake_quant->exec_node_list, i); + + int out_t = node_fp32->ir_node->output_tensors[0]; + this->ir_exec[graphn_fp32->tensor_list[out_t]->producer] = i; // ir idx --> exec idx + this->exec_ir[i] = graphn_fp32->tensor_list[out_t]->producer; // exec idx --> ir idx + // printf(" %d : %d\n", graphn_fp32->tensor_list[out_t]->producer, i); + } + + /* check for free node*/ + this->check_for_free(); + + return 0; +} + +void QuantTool::activation_requant(float* data, int elem_num, int bitcount, int symmetry, float scale, int zero_point) +{ + // symmetry = 0; + float fake_quant_max; + float 
fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + for (int i = 0; i < elem_num; i++) + { + data[i] = round(data[i] / scale) + zero_point; + data[i] = data[i] > fake_quant_max ? fake_quant_max : data[i]; + data[i] = data[i] < fake_quant_min ? fake_quant_min : data[i]; + data[i] = (data[i] - zero_point) * scale; + } +} + +void QuantTool::recursion_pass_through(struct graph* graphn, const char* layer_name, struct tensor* t, + dict_str2int& layer_used, dict_str2float& layer_scale, dict_str2float& layer_zeropoint, dict_str2int& layer_pass) +{ + if (layer_pass[t->name] == 0 && layer_used[t->name] < 2) + { + t->scale = layer_scale[layer_name]; + t->zero_point = layer_zeropoint[layer_name]; + layer_scale[t->name] = layer_scale[layer_name]; + layer_zeropoint[t->name] = layer_zeropoint[layer_name]; + + uint32_t ir_node_idx = t->producer; + struct node* t_node = graphn->node_list[ir_node_idx]; + + auto op_name = t_node->op.type; + bool poolTrue = false; + bool reluTrue = false; + if (op_name == OP_POOL) + { + struct pool_param* pool_param = (struct pool_param*)t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == OP_RELU) + { + struct relu_param* relu_param = (struct relu_param*)t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + if (op_name == OP_FLATTEN || op_name == OP_RESHAPE || op_name == OP_SQUEEZE || op_name == OP_CLIP || poolTrue || reluTrue) + { + struct tensor* t_in_tensor = graphn->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[t->name] != 0) + { + if (t_in_tensor->tensor_type == 1 || t_in_tensor->tensor_type == 3) + { + QuantTool::recursion_pass_through(graphn, t->name, t_in_tensor, layer_used, layer_scale, layer_zeropoint, layer_pass); + } + } + } + layer_pass[t->name] = 1; + } +} + +struct 
exec_graph* QuantTool::get_exec_graph(struct graph* graphn) +{ + struct subgraph* subgraph = get_ir_graph_subgraph(graphn, 0); + struct exec_graph* exec_graph = (struct exec_graph*)subgraph->device_graph; + + return exec_graph; +} + +void QuantTool::check_for_free() +{ + dict_uint2uint nodeA2B; + for (int i = 0; i < this->exec_node_num; i++) + { + this->node_fp32 = (struct exec_node*)get_vector_data(this->exec_graph_fp32->exec_node_list, i); + this->op_name = this->node_fp32->ir_node->op.type; + + for (int j = 0; j < this->node_fp32->ir_node->input_num; j++) + { + struct tensor* t = graphn_fp32->tensor_list[node_fp32->ir_node->input_tensors[j]]; + if (t->tensor_type == 1) + { + uint32_t ir_idx = t->producer; + nodeA2B[this->ir_exec[ir_idx]] = i; + } + } + } + + for (auto iter = nodeA2B.begin(); iter != nodeA2B.end(); iter++) + { + this->dict_free[iter->second].push_back(iter->first); + // printf(" map %d %d\n", iter->first, iter->second); + } +} + +void QuantTool::check_for_interlearve() +{ + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + /* get weight tensor */ + this->weight_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->input_tensors[1]]; + this->weight_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[1]]; + this->weight_size = this->weight_tensor_fp32->elem_num * this->weight_tensor_fp32->elem_size; + + this->weight_data_fp32 = (float*)this->weight_tensor_fp32->data; + this->weight_data_fake_quant = (float*)this->weight_tensor_fake_quant->data; + + if (this->op_name == OP_CONV) + { + this->conv_param_fp32 = (struct conv_param*)this->node_fp32->ir_node->op.param_mem; + this->conv_param_fake_quant = (struct conv_param*)this->node_fake_quant->ir_node->op.param_mem; + + if (this->conv_param_fp32->group != this->conv_param_fp32->output_channel) + { + this->conv_priv_info_fp32 = (struct conv_priv_info*)this->node_fp32->ops_priv; + this->conv_priv_info_fake_quant = (struct 
conv_priv_info*)this->node_fake_quant->ops_priv; + + this->interleave_size_fake = this->conv_priv_info_fp32->interleave_buffer_pack4_size; + + this->interleave_buffer_fp32 = (float*)this->conv_priv_info_fp32->interleave_buffer_pack4; + this->interleave_buffer_fake_quant = (float*)this->conv_priv_info_fake_quant->interleave_buffer_pack4; + } + } + else + this->interleave_size_fake = 0; + } +} + +void QuantTool::weight_bias_requant(int search) +{ + /* weight requant */ + // printf("### 1.1 this->weight_tensor_fake_quant->scale %f\n",this->weight_tensor_fake_quant->scale); + if (0 == search) + this->weight_requant(this->weight_tensor_fake_quant, this->weight_data_fake_quant, this->weight_tensor_fake_quant->elem_num, 8, 1, this->weight_tensor_fake_quant->dims[0]); + + if (this->interleave_size_fake != 0) + { + int M = this->weight_tensor_fake_quant->dims[0]; + int K = this->weight_tensor_fake_quant->elem_num / weight_tensor_fake_quant->dims[0]; + this->conv_hcl_interleave_pack4_fp32(M, K, this->weight_data_fake_quant, this->interleave_buffer_fake_quant); + } + + /* bias requant */ + if (this->node_fake_quant->ir_node->input_num > 2) + { + this->input_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[0]]; + this->bias_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->input_tensors[2]]; + this->bias_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->input_tensors[2]]; + this->bias_size = this->bias_tensor_fp32->elem_num * this->bias_tensor_fp32->elem_size; + this->bias_data_fp32 = (float*)this->bias_tensor_fp32->data; + this->bias_data_fake_quant = (float*)this->bias_tensor_fake_quant->data; + this->bias_requant(this->input_tensor_fake_quant, this->weight_tensor_fake_quant, this->bias_tensor_fake_quant, + this->bias_data_fake_quant, this->bias_tensor_fake_quant->elem_num, this->bias_tensor_fake_quant->dims[0]); + // this->bias_tensor_fp32->scale = 
this->bias_tensor_fake_quant->scale; + } +} + +void QuantTool::set_node_input_output_tensor(int idx, int imgi, int snum) +{ + this->out_imgs_fp32[imgi].resize(this->output_tensor_fp32->elem_num); + this->out_imgs_fake_quant[imgi].resize(this->output_tensor_fp32->elem_num); + + if (idx == 0) + { + set_tensor_buffer(this->graph_input_tensor_fp32, this->input_datas_fp32[imgi].data(), this->img_size * 4); + set_tensor_buffer(this->graph_input_tensor_fake_quant, this->input_datas_fake_quant[imgi].data(), this->img_size * 4); + } + else + { + for (int inputi = 0; inputi < this->node_fp32->ir_node->input_num; inputi++) + { + uint32_t ir_input_tensor_idx = this->node_fp32->ir_node->input_tensors[inputi]; + this->input_tensor_fp32 = this->graphn_fp32->tensor_list[ir_input_tensor_idx]; + this->input_tensor_fake_quant = this->graphn_fake_quant->tensor_list[ir_input_tensor_idx]; + + if (this->input_tensor_fp32->tensor_type == 1) + { + uint32_t ir_node_idx = this->input_tensor_fp32->producer; + uint32_t input_size = this->input_tensor_fp32->elem_num * input_tensor_fp32->elem_size; + + uint32_t exec_node_idx = this->ir_exec[ir_node_idx]; + + if (imgi == 0 && snum == 0) + { + float* buf_fp32 = (float*)sys_malloc(32); + float* buf_fake_quant = (float*)sys_malloc(32); + + set_tensor_buffer(this->input_tensor_fp32, buf_fp32, input_size); + set_tensor_buffer(this->input_tensor_fake_quant, buf_fake_quant, input_size); + + set_tensor_buffer(this->input_tensor_fp32, this->fp32_out[exec_node_idx][imgi].data(), input_size); + set_tensor_buffer(this->input_tensor_fake_quant, this->fake_quant_out[exec_node_idx][imgi].data(), input_size); + } + else + { + set_tensor_buffer(this->input_tensor_fp32, this->fp32_out[exec_node_idx][imgi].data(), input_size); + set_tensor_buffer(this->input_tensor_fake_quant, this->fake_quant_out[exec_node_idx][imgi].data(), input_size); + } + } // output tensor + } // node input number + } // node i > 0 + + /* init output buffer */ + 
set_tensor_buffer(this->output_tensor_fp32, this->out_imgs_fp32[imgi].data(), this->output_tensor_fp32->elem_num * this->output_tensor_fp32->elem_size); + set_tensor_buffer(this->output_tensor_fake_quant, this->out_imgs_fake_quant[imgi].data(), this->output_tensor_fake_quant->elem_num * this->output_tensor_fake_quant->elem_size); +} + +double QuantTool::cosin_similarity(std::vector >& in_a, std::vector >& in_b, uint32_t imgs_num, uint32_t output_num) +{ + double norm_a = 0; + double norm_b = 0; + double a_b = 0; + + uint32_t fnum = (output_num >> 4) << 4; + uint32_t rnum = output_num - fnum; + +#if 0 //__AVX__ + + float _sumaa0[8] = {0.f}; + float _sumbb0[8] = {0.f}; + float _sumaabb0[8] = {0.f}; + float _sumaa1[8] = {0.f}; + float _sumbb1[8] = {0.f}; + float _sumaabb1[8] = {0.f}; + + __m256 _suma_o0 = _mm256_set1_ps(0.0); + __m256 _sumb_o0 = _mm256_set1_ps(0.0); + __m256 _sumab_o0 = _mm256_set1_ps(0.0); + __m256 _suma_o1 = _mm256_set1_ps(0.0); + __m256 _sumb_o1 = _mm256_set1_ps(0.0); + __m256 _sumab_o1 = _mm256_set1_ps(0.0); + + for (int i = 0; i < imgs_num; i++) + { + const float* in_a_addr = in_a[i].data(); + const float* in_b_addr = in_b[i].data(); + for (int j = 0; j < fnum; j=j+32) + { + __m256 _in_a0 = _mm256_loadu_ps(in_a_addr+j); + __m256 _in_b0 = _mm256_loadu_ps(in_b_addr+j); + __m256 _in_a1 = _mm256_loadu_ps(in_a_addr+j+8); + __m256 _in_b1 = _mm256_loadu_ps(in_b_addr+j+8); + + _suma_o0 = _mm256_fmadd_ps(_in_a0, _in_a0, _suma_o0); + _sumb_o0 = _mm256_fmadd_ps(_in_b0, _in_b0, _sumb_o0); + _sumab_o0 = _mm256_fmadd_ps(_in_a0, _in_b0, _sumab_o0); + _suma_o1 = _mm256_fmadd_ps(_in_a1, _in_a1, _suma_o1); + _sumb_o1 = _mm256_fmadd_ps(_in_b1, _in_b1, _sumb_o1); + _sumab_o1 = _mm256_fmadd_ps(_in_a1, _in_b1, _sumab_o1); + } + } + _mm256_storeu_ps(_sumaa0, _suma_o0); + _mm256_storeu_ps(_sumbb0, _sumb_o0); + _mm256_storeu_ps(_sumaabb0, _sumab_o0); + _mm256_storeu_ps(_sumaa1, _suma_o1); + _mm256_storeu_ps(_sumbb1, _sumb_o1); + _mm256_storeu_ps(_sumaabb1, _sumab_o1); + 
+ for (int i = 0; i < 8; i++) + { + norm_a += _sumaa0[i] + _sumaa1[i]; + norm_b += _sumbb0[i] + _sumbb1[i]; + a_b += _sumaabb0[i] + _sumaabb1[i]; + + } + +#else // normal + // printf("AAAA DIRECT\n"); + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < fnum; j = j + 8) + { + for (int k = 0; k < 8; k = k + 1) + { + norm_a += in_a[i][j + k] * in_a[i][j + k]; + + norm_b += in_b[i][j + k] * in_b[i][j + k]; + + a_b += in_a[i][j + k] * in_b[i][j + k]; + } + } + } + +#endif // __SSE__ __AVX__ + + for (int j = fnum; j < output_num; j++) + { + for (int i = 0; i < imgs_num; i++) + { + norm_a += in_a[i][j] * in_a[i][j]; + norm_b += in_b[i][j] * in_b[i][j]; + a_b += in_a[i][j] * in_b[i][j]; + } + } + + double cosin = 0.0; + double _a_b_ = sqrt(norm_a) * sqrt(norm_b); + if (_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin = a_b; + else + cosin = a_b / _a_b_; + if (cosin < -999999 || cosin > 999999) + cosin = 0; + return cosin; +} + +double QuantTool::cosin_similarity(std::vector* in_a, std::vector* in_b, uint32_t imgs_num, uint32_t output_num) +{ + uint32_t output_channel = 1; + std::vector norm_a(output_channel, 0.0); + std::vector norm_b(output_channel, 0.0); + std::vector a_b(output_channel, 0.0); + + int elem_perchannel = int(output_num / output_channel); + + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < output_channel; j++) + { + for (int k = 0; k < elem_perchannel; k++) + { + int elem_idx = j * elem_perchannel + k; + norm_a[j] += in_a[i][elem_idx] * in_a[i][elem_idx]; + norm_b[j] += in_b[i][elem_idx] * in_b[i][elem_idx]; + a_b[j] += in_a[i][elem_idx] * in_b[i][elem_idx]; + } + } + } + + double cosin; + for (int j = 0; j < output_channel; j++) + { + double _a_b_ = sqrt(norm_a[j]) * sqrt(norm_b[j]); + if (_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin = a_b[j]; + else + cosin = a_b[j] / _a_b_; + if (cosin < -999999 || cosin > 999999) + cosin = 0; + } + return cosin; +} + +void QuantTool::weight_requant(struct tensor* weight_tensor, float* 
data, int elem_num, int bitcount, int symmetry, int elem_channel) +{ + float* scale_list = (float*)sys_malloc(elem_channel * 4); + int* zero_point_list = (int*)sys_malloc(elem_channel * 4); + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + float scale = 1; + int zero_point = 0; + for (int c = 0; c < elem_channel; c++) + { + float weight_max = *std::max_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + float weight_min = *std::min_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + if (symmetry == 1) + { + if (abs(weight_max) > abs(weight_min)) + scale = abs(weight_max) / fake_quant_max; + else + scale = abs(weight_min) / fake_quant_max; + zero_point = 0; + } + else + { + scale = (weight_max - weight_min) / fake_quant_max; + zero_point = int(-weight_min / scale); + } + + scale_list[c] = scale; + zero_point_list[c] = zero_point; + } + + if (weight_tensor->scale_list == NULL) + { + // printf(" EMPTY\n "); + weight_tensor->scale_list = scale_list; + weight_tensor->zp_list = zero_point_list; + } + else + { + scale_list = weight_tensor->scale_list; + zero_point_list = weight_tensor->zp_list; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale_list[i]) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? 
fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale_list[i]; + } + } + } +} + +void QuantTool::conv_hcl_interleave_pack4_fp32(int M, int K, float* pA, float* pA_t) +{ + int nn_outch = M >> 3; + int remain_outch_start = nn_outch << 3; + + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + const float* k0 = pA + (p + 0) * K; + const float* k1 = pA + (p + 1) * K; + const float* k2 = pA + (p + 2) * K; + const float* k3 = pA + (p + 3) * K; + const float* k4 = pA + (p + 4) * K; + const float* k5 = pA + (p + 5) * K; + const float* k6 = pA + (p + 6) * K; + const float* k7 = pA + (p + 7) * K; + + float* ktmp = pA_t + (p / 8) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp[1] = k1[0]; + ktmp[2] = k2[0]; + ktmp[3] = k3[0]; + ktmp[4] = k4[0]; + ktmp[5] = k5[0]; + ktmp[6] = k6[0]; + ktmp[7] = k7[0]; + ktmp += 8; + + k0 += 1; + k1 += 1; + k2 += 1; + k3 += 1; + k4 += 1; + k5 += 1; + k6 += 1; + k7 += 1; + } + } + + nn_outch = (M - remain_outch_start) >> 2; + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + const float* k0 = pA + (p + 0) * K; + const float* k1 = pA + (p + 1) * K; + const float* k2 = pA + (p + 2) * K; + const float* k3 = pA + (p + 3) * K; + + float* ktmp = pA_t + (p / 8 + (p % 8) / 4) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp[1] = k1[0]; + ktmp[2] = k2[0]; + ktmp[3] = k3[0]; + ktmp += 4; + + k0 += 1; + k1 += 1; + k2 += 1; + k3 += 1; + } + } + + remain_outch_start += nn_outch << 2; + + for (int p = remain_outch_start; p < M; p++) + { + const float* k0 = pA + (p + 0) * K; + + float* ktmp = pA_t + (p / 8 + (p % 8) / 4 + p % 4) * 8 * K; + + for (int q = 0; q < K; q++) + { + ktmp[0] = k0[0]; + ktmp++; + k0++; + } + } +} + +void QuantTool::gen_weight_scale(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel) +{ + float* scale_list = (float*)sys_malloc(elem_channel * 4); + int* 
zero_point_list = (int*)sys_malloc(elem_channel * 4); + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + float scale = 1; + int zero_point = 0; + for (int c = 0; c < elem_channel; c++) + { + float weight_max = *std::max_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + float weight_min = *std::min_element(data + c * elem_perchannel, data + (c + 1) * elem_perchannel); + if (symmetry == 1) + { + if (abs(weight_max) > abs(weight_min)) + scale = abs(weight_max) / fake_quant_max; + else + scale = abs(weight_min) / fake_quant_max; + zero_point = 0; + } + else + { + scale = (weight_max - weight_min) / fake_quant_max; + zero_point = int(-weight_min / scale); + } + + scale_list[c] = scale; + zero_point_list[c] = zero_point; + } + + weight_tensor->scale_list = scale_list; + weight_tensor->zp_list = zero_point_list; +} + +void QuantTool::bias_requant(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, + float* data, int elem_num, int elem_channel) +{ + int elem_perchannel = elem_num / elem_channel; + float* scale_list = (float*)sys_malloc(elem_channel * 4); + + for (int c = 0; c < elem_channel; c++) + { + float input_scale = input_tensor->scale; + float weight_scale = weight_tensor->scale_list[c]; + float bias_scale = input_scale * weight_scale; + scale_list[c] = bias_scale; + } + + bias_tensor->scale_list = scale_list; + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + { + data[data_idx] = 0; + } + else + { + data[data_idx] = round(data[data_idx] / scale_list[i]); + data[data_idx] = data[data_idx] * scale_list[i]; + } + } + } +} + +void 
QuantTool::weight_bias_reset() +{ + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + std::memcpy(this->weight_data_fake_quant, this->weight_data_fp32, this->weight_size); + std::memcpy(this->interleave_buffer_fake_quant, this->interleave_buffer_fp32, this->interleave_size_fake); + if (this->node_fake_quant->ir_node->input_num > 2) + { + memcpy(this->bias_data_fake_quant, this->bias_data_fp32, this->bias_size); + } + } +} + +void QuantTool::free_used_layers(int idx) +{ + // printf("#### free 0 idx %d\n",idx); + if (this->dict_free[idx].size() > 0) + { + // printf("#### free 1 idx %d\n",idx); + std::vector > freen_fp32; + std::vector > freen_fake_quant; + for (int fi = 0; fi < this->dict_free[idx].size(); fi++) + { + if (this->dict_free[idx][fi] != 0) + { + // printf("---free---\n"); + this->fp32_out[this->dict_free[idx][fi]].clear(); + this->fake_quant_out[this->dict_free[idx][fi]].clear(); + } + } + } +} + +void QuantTool::load_activation_scale(struct graph* graphn, const char* scale_file, int mode_sc) +{ + std::unordered_map layer_scale; + std::unordered_map layer_zeropoint; + bool parse_from_file = false; + if (nullptr != scale_file) + { + std::ifstream scales(scale_file); + std::string line; + while (std::getline(scales, line)) + { + std::string layer_name; + float scale_val = 0.f; + float zero_point = 0.f; + size_t last = 0; + size_t index = line.find_first_of(" ", last); + size_t idx = line.find_last_of(" ", line.size()); + layer_name = line.substr(last, index - last); + // printf("layer_name : %s \n", layer_name.c_str()); + last = index + 1; + scale_val = atof((line.substr(last, line.size() - last)).c_str()); + zero_point = atof((line.substr(idx + 1, line.size())).c_str()); + + layer_scale[layer_name] = scale_val; + layer_zeropoint[layer_name] = zero_point; + // fprintf(stderr, "quant value : %s %f %f \n", layer_name.c_str(), scale_val, zero_point); + } + } + + std::unordered_map layer_used; + for (int i = 0; i < graphn->node_num; i++) + { + 
struct node* noden = graphn->node_list[i]; + for (int j = 0; j < noden->input_num; j++) + { + std::string layern = graphn->tensor_list[noden->input_tensors[j]]->name; + layer_used[layern]++; + } + } + + if (mode_sc == 0) + { + for (int i = 0; i < graphn->tensor_num; i++) + { + struct tensor* t = graphn->tensor_list[i]; + if (t->tensor_type == 1 || t->tensor_type == 3) + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + } + } + } + else + { + std::unordered_map layer_pass; + for (int i = graphn->tensor_num - 1; i >= 0; i--) + { + struct tensor* t = graphn->tensor_list[i]; + if (t->tensor_type == 1 || t->tensor_type == 3) + { + if (layer_pass[t->name] == 0) + { + uint32_t ir_node_idx = t->producer; + struct node* t_node = graphn->node_list[ir_node_idx]; + + auto op_name = t_node->op.type; + + bool poolTrue = false; + bool reluTrue = false; + if (op_name == OP_POOL) + { + struct pool_param* pool_param = (struct pool_param*)t_node->op.param_mem; + if (pool_param->pool_method == 0) + poolTrue = true; + } + else if (op_name == OP_RELU) + { + struct relu_param* relu_param = (struct relu_param*)t_node->op.param_mem; + if (relu_param->negative_slope == 0.f) + reluTrue = true; + } + + if (op_name == OP_FLATTEN || op_name == OP_RESHAPE || op_name == OP_SQUEEZE || op_name == OP_CLIP || poolTrue || reluTrue) + { + struct tensor* t_in_tensor = graphn->tensor_list[t_node->input_tensors[0]]; + if (layer_scale[t->name] != 0) + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + + if (t_in_tensor->tensor_type == 1 || t_in_tensor->tensor_type == 3) + { + this->recursion_pass_through(graphn, t->name, t_in_tensor, layer_used, layer_scale, + layer_zeropoint, layer_pass); + } + } + } + else + { + t->scale = layer_scale[t->name]; + t->zero_point = layer_zeropoint[t->name]; + } + layer_pass[t->name] = 1; + } + } + } + } + + // for (int i = 0; i < graphn->tensor_num; i++) + // { + // struct ir_tensor* t = 
graphn->tensor_list[i]; + // if (t->tensor_type == 1 || t->tensor_type == 3) + // { + // printf(" sz %s %f %d \n",t->name, t->scale, t->zero_point); + // } + // } +} + +int QuantTool::get_exec_node_message(int exec_node_idx) +{ + /* get node */ + this->node_fp32 = (struct exec_node*)get_vector_data(this->exec_graph_fp32->exec_node_list, exec_node_idx); + this->node_fake_quant = (struct exec_node*)get_vector_data(this->exec_graph_fake_quant->exec_node_list, exec_node_idx); + + /* get op type */ + this->op_name = this->node_fp32->ir_node->op.type; + + /* get exec ops */ + this->node_ops_fp32 = this->node_fp32->node_ops; + this->node_ops_fake_quant = this->node_fake_quant->node_ops; + + /* handle the shape changed and dynamic shape case */ + if (this->node_ops_fp32->reshape && this->node_ops_fp32->reshape(this->node_ops_fp32, this->node_fp32, this->exec_graph_fp32) + && this->node_ops_fake_quant->reshape && this->node_ops_fake_quant->reshape(this->node_ops_fake_quant, this->node_fake_quant, this->exec_graph_fake_quant) < 0) + { + TLOG_ERR("failed to reshape node %d, %s\n", node_fp32->ir_node->index, node_fp32->ir_node->name); + return -1; + } + + /* get output tensor */ + this->output_tensor_fp32 = this->graphn_fp32->tensor_list[this->node_fp32->ir_node->output_tensors[0]]; + this->output_tensor_fake_quant = this->graphn_fake_quant->tensor_list[this->node_fake_quant->ir_node->output_tensors[0]]; + + /* get exec ops */ + this->execidx_elemnum[exec_node_idx] = this->output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[exec_node_idx] = this->output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[exec_node_idx] = this->output_tensor_fp32->name; //exec idx --> output tensor name + + return 0; +} + +void QuantTool::cosin_similarity(std::vector& cosin, std::vector >& in_a, std::vector >& in_b, uint32_t imgs_num, uint32_t output_num, uint32_t output_channel) // cosin dis perchannel +{ + // fprintf(stderr, " 
in_a %f ",in_a[0][0]); + // fprintf(stderr, " in_b %f ",in_b[0][0]); + + std::vector norm_a(output_channel, 0.0); + std::vector norm_b(output_channel, 0.0); + std::vector a_b(output_channel, 0.0); + + int elem_perchannel = int(output_num / output_channel); + + for (int i = 0; i < imgs_num; i++) + { + for (int j = 0; j < output_channel; j++) + { + for (int k = 0; k < elem_perchannel; k++) + { + int elem_idx = j * elem_perchannel + k; + norm_a[j] += in_a[i][elem_idx] * in_a[i][elem_idx]; + norm_b[j] += in_b[i][elem_idx] * in_b[i][elem_idx]; + a_b[j] += in_a[i][elem_idx] * in_b[i][elem_idx]; + } + } + } + + cosin.resize(output_channel); + for (int j = 0; j < output_channel; j++) + { + double _a_b_ = sqrt(norm_a[j]) * sqrt(norm_b[j]); + // fprintf(stderr, " %lf %f %f \n ", _a_b_, sqrt(norm_a[j]), sqrt(norm_b[j]) ); + if (_a_b_ < 0.0000001f && _a_b_ > -0.0000001f) + cosin[j] = a_b[j]; + else + cosin[j] = a_b[j] / _a_b_; + if (cosin[j] < -999999 || cosin[j] > 999999) + cosin[j] = 0; + } +} + +int QuantTool::assess_quant_loss(int gen) +{ + this->init(); + for (int i = 0; i < this->exec_node_num; i++) + { + this->get_exec_node_message(i); + this->check_for_interlearve(); + + this->out_imgs_fp32.resize(this->max_search_img_num); + this->out_imgs_fake_quant.resize(this->max_search_img_num); + if (this->op_name == OP_CONV || this->op_name == OP_FC) + this->weight_bias_requant(gen); + + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, 0); + + /* op run */ + this->node_ops_fp32->run(this->node_ops_fp32, this->node_fp32, this->exec_graph_fp32); + this->node_ops_fake_quant->run(this->node_ops_fake_quant, this->node_fake_quant, this->exec_graph_fake_quant); + this->activation_requant(this->out_imgs_fake_quant[imgi].data(), this->output_tensor_fake_quant->elem_num, 8, 1, this->output_tensor_fake_quant->scale, this->output_tensor_fake_quant->zero_point); + } + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && 
this->max_search_img_num > 1)) + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], this->weight_tensor_fp32->dims[0]); + else + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) + this->print_cosin(this->cosin.data(), i, this->weight_tensor_fp32->dims[0]); + else + this->print_cosin(this->cosin.data(), i, 1); + // fprintf(stderr, "cosin [%s] : %f\n", execidx_nodename[i].c_str(), cosin); + + this->weight_bias_reset(); + this->free_used_layers(i); + + /* save node output */ + this->fp32_out.push_back(this->out_imgs_fp32); + this->fake_quant_out.push_back(this->out_imgs_fake_quant); + } + + return 0; +} + +void QuantTool::print_cosin(double* cosin, int idx, int output_channel) +{ + float avg_cosin = 0; + float avg_num = 0; + for (int c = 0; c < output_channel; c++) + { + if (cosin[c] != 0) + { + avg_cosin += cosin[c]; + avg_num++; + } + } + fprintf(stderr, "cosin %3d %4d avg %0.6f ### ", idx, output_channel, avg_cosin / avg_num); + for (int c = 0; c < output_channel; c++) + { + fprintf(stderr, "%0.6f ", cosin[c]); + } + fprintf(stderr, "\n"); +} + +void QuantTool::weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float zoom) +{ + float* scale_list = (float*)weight_tensor->scale_list; + int* zero_point_list = (int*)weight_tensor->zp_list; + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + float scale = 
scale_list[i] * zoom; + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale; + } + } + } +} +void QuantTool::weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float* zoom) +{ + float* scale_list = (float*)weight_tensor->scale_list; + int* zero_point_list = (int*)weight_tensor->zp_list; + + int elem_perchannel = elem_num / elem_channel; + + float fake_quant_max; + float fake_quant_min; + + if (symmetry == 1) + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = -fake_quant_max; + } + else + { + fake_quant_max = pow(2, bitcount - symmetry) - 1; + fake_quant_min = 0; + } + + int data_idx; + for (int i = 0; i < elem_channel; i++) + { + float scale = 1; + if (zoom[i] > 5) + scale = scale_list[i]; + else + scale = scale_list[i] * zoom[i]; + for (int j = 0; j < elem_perchannel; j++) + { + data_idx = i * elem_perchannel + j; + if (scale_list[i] == 0) + data[data_idx] = 0; + else + { + data[data_idx] = round(data[data_idx] / scale) + zero_point_list[i]; + data[data_idx] = data[data_idx] > fake_quant_max ? fake_quant_max : data[data_idx]; + data[data_idx] = data[data_idx] < fake_quant_min ? 
fake_quant_min : data[data_idx]; + data[data_idx] = (data[data_idx] - zero_point_list[i]) * scale; + } + } + } +} + +int QuantTool::quant_search() +{ + this->init(); + for (int i = 0; i < this->exec_node_num; i++) + { + this->get_exec_node_message(i); + this->check_for_interlearve(); + + this->out_imgs_fp32.resize(this->max_search_img_num); + this->out_imgs_fake_quant.resize(this->max_search_img_num); + + if (this->op_name == OP_CONV || this->op_name == OP_FC) + { + this->gen_weight_scale(this->weight_tensor_fake_quant, this->weight_data_fake_quant, this->weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0]); + this->gen_weight_scale(this->weight_tensor_fp32, this->weight_data_fp32, this->weight_tensor_fp32->elem_num, 8, 1, weight_tensor_fp32->dims[0]); + + std::vector cosin_save(weight_tensor_fake_quant->dims[0], -1); + std::vector zoom_save(weight_tensor_fake_quant->dims[0], -1); + for (int snum = 0; snum < 201; snum = snum + 20) + { + float zoom = 1.3 / 200 * (snum + 1); + // float zoom = 1.0; + /* weight requant */ + if (snum < 200) + this->weight_requant_search(weight_tensor_fake_quant, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0], zoom); + else + { + this->weight_requant_search(weight_tensor_fake_quant, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, 8, 1, weight_tensor_fake_quant->dims[0], zoom_save.data()); + float* buf = (float*)sys_malloc(weight_tensor_fake_quant->dims[0] * 4); + memcpy(buf, zoom_save.data(), weight_tensor_fake_quant->dims[0] * 4); + // printf(" scale3 %f \n",weight_tensor_fp32->scale_list[0]); + for (int bi = 0; bi < weight_tensor_fake_quant->dims[0]; bi++) + { + buf[bi] *= weight_tensor_fp32->scale_list[bi]; + } + // printf(" scale4 %f \n",buf[0]); + // weight_tensor_fake_quant->scale_list = buf; + weight_tensor_fp32->scale_list = buf; + weight_tensor_fp32->quant_param_num = weight_tensor_fp32->dims[0]; + // printf(" scale5 %f 
\n",weight_tensor_fp32->scale_list[0]); + } + if (interleave_size_fake != 0) + { + int M = weight_tensor_fake_quant->dims[0]; + int K = weight_tensor_fake_quant->elem_num / weight_tensor_fake_quant->dims[0]; + this->conv_hcl_interleave_pack4_fp32(M, K, weight_data_fake_quant, interleave_buffer_fake_quant); + } + + /* bias requant */ + if (node_fake_quant->ir_node->input_num > 2) + { + struct tensor* input_tensor_fake_quant = graphn_fake_quant->tensor_list[node_fake_quant->ir_node->input_tensors[0]]; + struct tensor* bias_tensor_fake_quant = graphn_fake_quant->tensor_list[node_fake_quant->ir_node->input_tensors[2]]; + struct tensor* bias_tensor_fp32 = graphn_fp32->tensor_list[node_fp32->ir_node->input_tensors[2]]; + + bias_size = bias_tensor_fp32->elem_num * bias_tensor_fp32->elem_size; + + bias_data_fp32 = (float*)bias_tensor_fp32->data; + bias_data_fake_quant = (float*)bias_tensor_fake_quant->data; + + this->bias_requant(input_tensor_fake_quant, weight_tensor_fake_quant, bias_tensor_fake_quant, + bias_data_fake_quant, bias_tensor_fake_quant->elem_num, bias_tensor_fake_quant->dims[0]); + } + + /* per image run */ + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, snum); + + /* FP32 op run */ + if (snum == 0) + { + // set_tensor_buffer(output_tensor_fp32, out_imgs_fp32[imgi].data(), output_tensor_fp32->elem_num * output_tensor_fp32->elem_size); + node_ops_fp32->run(node_ops_fp32, node_fp32, exec_graph_fp32); + + this->execidx_elemnum[i] = output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[i] = output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[i] = output_tensor_fp32->name; + } + + /* fake quant op run */ + // set_tensor_buffer(output_tensor_fake_quant, out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num * output_tensor_fake_quant->elem_size); + node_ops_fake_quant->run(node_ops_fake_quant, node_fake_quant, 
exec_graph_fake_quant); + this->activation_requant(out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num, 8, 1, output_tensor_fake_quant->scale, output_tensor_fake_quant->zero_point); + } // image number + + output_channel = output_tensor_fp32->dims[1]; + + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], output_channel); + else + this->cosin_similarity(this->cosin, this->out_imgs_fp32, this->out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + + // this->cosin_similarity(this->cosin, out_imgs_fp32, out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], output_channel); + + for (int cosi = 0; cosi < output_channel; cosi++) + { + if (cosin[cosi] > cosin_save[cosi]) + { + cosin_save[cosi] = cosin[cosi]; + zoom_save[cosi] = zoom; + } + } + if (snum == 200) + { + if (this->op_name == OP_CONV || (this->op_name == OP_FC && this->max_search_img_num > 1)) + this->print_cosin(this->cosin.data(), i, output_channel); + else + this->print_cosin(this->cosin.data(), i, 1); + } + + if (op_name == OP_CONV || op_name == OP_FC) + { + memcpy(weight_data_fake_quant, weight_data_fp32, weight_size); + // this->weight_correction(weight_data_fp32, weight_data_fake_quant, weight_tensor_fake_quant->elem_num, this->bitcount, this->symmetry, weight_tensor_fake_quant->dims[0]); + memcpy(interleave_buffer_fake_quant, interleave_buffer_fp32, interleave_size_fake); + if (node_fake_quant->ir_node->input_num > 2) + { + memcpy(bias_data_fake_quant, bias_data_fp32, bias_size); + } + } + } + } + else + { + /* per image run */ + for (int imgi = 0; imgi < this->max_search_img_num; imgi++) + { + this->set_node_input_output_tensor(i, imgi, 0); + + // set_tensor_buffer(output_tensor_fp32, out_imgs_fp32[imgi].data(), output_tensor_fp32->elem_num * 
output_tensor_fp32->elem_size); + node_ops_fp32->run(node_ops_fp32, node_fp32, exec_graph_fp32); + + /* fake quant op run */ + // set_tensor_buffer(output_tensor_fake_quant, out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num * output_tensor_fake_quant->elem_size); + node_ops_fake_quant->run(node_ops_fake_quant, node_fake_quant, exec_graph_fake_quant); + this->activation_requant(out_imgs_fake_quant[imgi].data(), output_tensor_fake_quant->elem_num, 8, 1, output_tensor_fake_quant->scale, output_tensor_fake_quant->zero_point); + + this->execidx_elemnum[i] = output_tensor_fp32->elem_num; //exec idx --> output elem num + this->execidx_elemsize[i] = output_tensor_fp32->elem_size; //exec idx --> output elem size + this->execidx_nodename[i] = output_tensor_fp32->name; + } + this->cosin_similarity(this->cosin, out_imgs_fp32, out_imgs_fake_quant, this->max_search_img_num, this->execidx_elemnum[i], 1); + this->print_cosin(this->cosin.data(), i, 1); + this->execidx_loss[i] = cosin; + } + + this->free_used_layers(i); + + /* save node output */ + this->fp32_out.push_back(this->out_imgs_fp32); + this->fake_quant_out.push_back(this->out_imgs_fake_quant); + } // node number + // fprintf(stderr, "--------------------------------------\n"); + + if (!save_graph(graphn_fp32, "save_i8_eq.tmfile")) + { + fprintf(stderr, "save graph failed.\n"); + return -1; + } + + return 0; +} diff --git a/tools/quantize/quant_save_graph.cpp b/tools/quantize/quant_save_graph.cpp index 410db6be3..0ff72f180 100644 --- a/tools/quantize/quant_save_graph.cpp +++ b/tools/quantize/quant_save_graph.cpp @@ -505,6 +505,11 @@ int save_graph_i8_perchannel(const char* model_file, const char* scale_file, con if (internal) { // TODO + for (int ch = 0; ch < channel_num; ch++) + { + weight_scale_list[ch] = weight_tensor->scale_list[ch]; + weight_zp_list[ch] = 0; + } } else { diff --git a/tools/quantize/quant_tool.hpp b/tools/quantize/quant_tool.hpp index cc8d9ae13..99b43beba 100644 --- 
a/tools/quantize/quant_tool.hpp +++ b/tools/quantize/quant_tool.hpp @@ -22,9 +22,14 @@ * Author: hhchen@openailab.com */ +#pragma once + #include #include #include +#include +#include +#include extern "C" { #include "api/c_api.h" @@ -34,11 +39,40 @@ extern "C" { #include "graph/tensor.h" #include "utility/sys_port.h" #include "utility/utils.h" +#include "utility/log.h" +#include "utility/vector.h" + +#include "../source/device/cpu/cpu_node.h" +#include "../source/device/cpu/cpu_graph.h" + +#include "convolution_param.h" +#include "fc_param.h" +#include "pooling_param.h" +#include "relu_param.h" } +#include "quant_utils.hpp" +#include "quant_save_graph.hpp" + +typedef std::unordered_map dict_str2int; +typedef std::unordered_map dict_str2float; +typedef std::unordered_map dict_uint2uint; +typedef std::unordered_map > dict_uint2vecuint; +typedef std::unordered_map dict_uint2str; +typedef std::unordered_map > dict_uint2doublex; + #define ALGORITHM_MIN_MAX 0 #define ALGORITHM_KL 1 #define ALGORITHM_ACIQ 2 +#define ALGORITHM_DFQ 3 +#define ALGORITHM_MM_EQ 4 + +struct node_graph +{ + int pass; + std::vector input_node_list; + std::vector output_node_list; +}; class QuantTool { @@ -46,7 +80,41 @@ class QuantTool QuantTool(); ~QuantTool(); + int init(); int activation_quant_tool(); + int assess_quant_loss(int gen); + int quant_search(); + int data_free_quant(); + +private: + void recursion_pass_through(struct graph* graphn, const char* layer_name, struct tensor* t, + dict_str2int& layer_used, dict_str2float& layer_scale, + dict_str2float& layer_zeropoint, dict_str2int& layer_pass); + + struct exec_graph* get_exec_graph(struct graph* graphn); + void load_activation_scale(struct graph* graphn, const char* scale_file, int mode_sc); + int prerun_for_get_ir_tensor(void* graph, struct options opt); + void check_for_free(); + + void check_for_interlearve(); + void weight_bias_requant(int search); + void conv_hcl_interleave_pack4_fp32(int M, int K, float* pA, float* pA_t); + void 
activation_requant(float* data, int elem_num, int bitcount, int symmetry, float scale, int zero_point = 0); + void weight_requant(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel); + void weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float zoom); + void weight_requant_search(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel, float* zoom); + void bias_requant(struct tensor* input_tensor, struct tensor* weight_tensor, struct tensor* bias_tensor, + float* data, int elem_num, int elem_channel); + void set_node_input_output_tensor(int idx, int imgi, int snum); + double cosin_similarity(std::vector* in_a, std::vector* in_b, uint32_t imgs_num, uint32_t output_num); + double cosin_similarity(std::vector >& in_a, std::vector >& in_b, uint32_t imgs_num, uint32_t output_num); + void cosin_similarity(std::vector& cosin, std::vector >& in_a, std::vector >& in_b, uint32_t imgs_num, uint32_t output_num, uint32_t output_channel); // cosin dis perchannel + void weight_bias_reset(); + void free_used_layers(int idx); + void gen_weight_scale(struct tensor* weight_tensor, float* data, int elem_num, int bitcount, int symmetry, int elem_channel); + int get_exec_node_message(int exec_node_idx); + + void print_cosin(double* cosin, int idx, int output_channel); public: struct options opt; @@ -70,4 +138,72 @@ class QuantTool int focus; // flag which indicates that focus process image is necessary(maybe using for YOLOv5, 0:OFF, 1:ON, default is 0) int inplace; // process the inplace quant scale of activation in some types of op, such as max pooling, ReLU, Flatten, Reshape, Clip int algorithm_type; // the type of quant algorithm(0:min-max, 1:kl, default is 0) + bool evaluate; // evaluate quantitative losses + +private: // system variable + dict_uint2uint ir_exec; + dict_uint2uint exec_ir; + dict_uint2vecuint 
dict_free; + dict_uint2uint execidx_elemnum; + dict_uint2uint execidx_elemsize; + dict_uint2str execidx_nodename; + dict_uint2doublex execidx_loss; + + int max_search_img_num; + + std::vector cosin; + +private: // basic message + int img_size; + double cosin_max; + float scale_acc; + +private: // ir graph variable + std::vector > > fp32_out; + std::vector > > fake_quant_out; + std::vector > input_datas_fp32; + std::vector > input_datas_fake_quant; + std::vector > out_imgs_fp32; + std::vector > out_imgs_fake_quant; + + struct graph* graphn_fp32; + struct graph* graphn_fake_quant; + struct tensor* graph_input_tensor_fp32; + struct tensor* graph_input_tensor_fake_quant; + struct exec_graph* exec_graph_fp32; + struct exec_graph* exec_graph_fake_quant; + int exec_node_num; + +private: // temp variable + uint16_t op_name; + + struct exec_node* node_fp32; + struct exec_node* node_fake_quant; + struct node_ops* node_ops_fp32; + struct node_ops* node_ops_fake_quant; + + struct tensor* input_tensor_fp32; + struct tensor* input_tensor_fake_quant; + struct tensor* weight_tensor_fp32; + struct tensor* weight_tensor_fake_quant; + struct tensor* bias_tensor_fp32; + struct tensor* bias_tensor_fake_quant; + struct tensor* output_tensor_fp32; + struct tensor* output_tensor_fake_quant; + + float* weight_data_fp32; + float* weight_data_fake_quant; + uint32_t weight_size; + float* interleave_buffer_fp32; + float* interleave_buffer_fake_quant; + uint32_t interleave_size_fake; + float* bias_data_fp32; + float* bias_data_fake_quant; + uint32_t bias_size; + uint32_t output_channel; + + struct conv_priv_info* conv_priv_info_fp32; + struct conv_priv_info* conv_priv_info_fake_quant; + struct conv_param* conv_param_fp32; + struct conv_param* conv_param_fake_quant; }; diff --git a/tools/quantize/quant_tool_int8.cpp b/tools/quantize/quant_tool_int8.cpp index 4e318d9e4..3c63ebf0b 100644 --- a/tools/quantize/quant_tool_int8.cpp +++ b/tools/quantize/quant_tool_int8.cpp @@ -66,6 +66,7 @@ 
QuantTool::QuantTool() this->focus = 0; this->inplace = true; this->algorithm_type = ALGORITHM_MIN_MAX; + this->evaluate = false; } QuantTool::~QuantTool() @@ -163,6 +164,7 @@ int QuantTool::activation_quant_tool() /* init minmax */ std::unordered_map max_activation; std::unordered_map min_activation; + std::unordered_map act_map; uint32_t act_tensor_num = 0; for (int i = 0; i < ir_graph->tensor_num; i++) { @@ -172,6 +174,7 @@ int QuantTool::activation_quant_tool() act_tensor_num++; max_activation[i] = -FLT_MAX; min_activation[i] = FLT_MAX; + act_map[act_tensor_num - 1] = i; } } @@ -213,10 +216,134 @@ int QuantTool::activation_quant_tool() } } } + fprintf(stderr, "\n"); if (this->algorithm_type == ALGORITHM_KL) { - /* todo support */ - fprintf(stderr, "\r\n[****WARNING****]:Step 2 find original calibration kl threshold table NOT support temporarily!\n"); + /* kl process divergence */ + fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table.\n"); + std::unordered_map tensor_hist; + std::unordered_map hist_tensor; + std::vector > hist_edge; + std::vector > hist_gram; + + /* second loop, create histgram */ + for (int nums = imgs_list.size() - 1; nums >= 0; nums--) + { + fprintf(stderr, "\r[Quant Tools Info]: Step 2, images %.5d / %.5d", nums + 1, img_num); + + get_input_data_cv(imgs_list[nums].c_str(), input_data.data(), img_c, img_h, img_w, mean, scale, sw_RGB, center_crop, letterbox_rows, letterbox_cols, focus); + + /* run graph */ + if (run_graph(ir_graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + + /* calculate hist */ + uint32_t inum = 0; + for (int i = 0; i < ir_graph->tensor_num; i++) + { + struct tensor* ir_tensor = ir_graph->tensor_list[i]; + if (ir_tensor->tensor_type == TENSOR_TYPE_VAR || ir_tensor->tensor_type == TENSOR_TYPE_INPUT) + { + float step_max = std::abs(max_activation[i]); + if (std::abs(min_activation[i]) > step_max) + step_max = std::abs(min_activation[i]); + float step_bin = step_max / 2048.0f; + + 
std::vector every_edge; + if (nums == imgs_list.size() - 1) + { + for (int j = 0; j < 2048; j++) + { + float edge_float = (step_bin * (j + 0.5f)); + every_edge.push_back(edge_float); + } + hist_edge.push_back(every_edge); + hist_gram.push_back(histCount((float*)ir_tensor->data, ir_tensor->elem_num, step_max)); + } + else + { + std::vector hist_tmp; + hist_tmp = histCount((float*)ir_tensor->data, ir_tensor->elem_num, step_max); + for (int j = 0; j < 2048; j++) + { + hist_gram[inum][j] += hist_tmp[j]; + } + } + + tensor_hist[i] = inum; + hist_tensor[inum] = i; + inum++; + } + } + } + + fprintf(stderr, "\n"); + + /* save the calibration file with min-max algorithm with kl divergence */ + int fake_quant_set = 127; + FILE* fp_kl = fopen("table_kl.scale", "wb"); + for (int i = 0; i < act_tensor_num; i++) + { + struct tensor* t = ir_graph->tensor_list[act_map[i]]; + int threshold_bin = threshold_distribution(hist_gram[i], fake_quant_set + 1); + fprintf(stderr, " threshold_bin %d \n", threshold_bin); + + float act_scale = hist_edge[i][threshold_bin] / fake_quant_set; + int act_zero_point = 0; + + /* the scale of softmax always is scale = 1 / 127.f */ + for (int j = 0; j < ir_graph->node_num; j++) + { + struct node* noden = ir_graph->node_list[j]; + struct tensor* tensor_tmp = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); + + if (!(tensor_tmp->tensor_type == TENSOR_TYPE_INPUT || tensor_tmp->tensor_type == TENSOR_TYPE_VAR)) + continue; + + std::string tmp_op_name = get_op_name_from_type(noden->op.type); + std::string cur_name = t->name; + std::string tmp_name = tensor_tmp->name; + + if ((cur_name == tmp_name) && tmp_op_name == "Softmax") + { + act_scale = 1 / 127.f; + act_zero_point = 0; + break; + } + } + + /* the scale of eltwise */ + for (int j = 0; j < ir_graph->node_num; j++) + { + struct node* noden = ir_graph->node_list[j]; + std::string tmp_op_name = get_op_name_from_type(noden->op.type); + if (tmp_op_name == "Eltwise") + { + struct tensor* tensor_in0 = 
get_ir_graph_tensor(ir_graph, noden->input_tensors[0]); + struct tensor* tensor_in1 = get_ir_graph_tensor(ir_graph, noden->input_tensors[1]); + struct tensor* tensor_out = get_ir_graph_tensor(ir_graph, noden->output_tensors[0]); + + std::string cur_name = t->name; + std::string tmp_name0 = tensor_in0->name; + std::string tmp_name1 = tensor_in1->name; + + if ((cur_name == tmp_name0 || cur_name == tmp_name1)) + { + act_scale = tensor_out->scale; + break; + } + } + } + + t->scale = act_scale; + t->zero_point = 0; + fprintf(fp_kl, "%s %f %d\n", t->name, act_scale, act_zero_point); + } + fclose(fp_kl); + fprintf(stderr, "[Quant Tools Info]: Step 2, find calibration table done, output ./table_kl.scale\n"); } else if (this->algorithm_type == ALGORITHM_ACIQ) { @@ -304,7 +431,7 @@ int QuantTool::activation_quant_tool() fprintf(stderr, "\r\n[Quant Tools Info]: Step 2, find original calibration minmax threshold table done, output ./table_minmax.scale\n"); } - fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num); + // fprintf(stderr, "[Quant Tools Info]: Thread %d, image nums %d, total time %.2f ms, avg time %.2f ms\n", num_thread, img_num, total_time, total_time / img_num); /* release tengine */ postrun_graph(ir_graph); @@ -343,7 +470,7 @@ int main(int argc, char* argv[]) QuantTool quant_tool; int res; - while ((res = getopt(argc, argv, "m:a:f:o:i:g:s:w:b:c:y:k:t:h")) != -1) + while ((res = getopt(argc, argv, "m:a:f:o:i:g:s:w:b:c:y:k:z:t:h")) != -1) { switch (res) { @@ -390,6 +517,9 @@ int main(int argc, char* argv[]) case 'k': quant_tool.focus = atoi(optarg); break; + case 'z': + quant_tool.evaluate = atoi(optarg); + break; case 't': quant_tool.num_thread = atoi(optarg); quant_tool.opt.num_thread = atoi(optarg); @@ -444,35 +574,100 @@ int main(int argc, char* argv[]) fprintf(stderr, "YOLOv5 focus: %s\n", quant_tool.focus ? 
"ON" : "OFF"); fprintf(stderr, "Thread num : %d\n\n", quant_tool.num_thread); - /* using 3rd calibration table file */ - if (quant_tool.scale_file.empty()) + switch (quant_tool.algorithm_type) + { + case ALGORITHM_MIN_MAX: { - /* select algorithm */ - if (quant_tool.algorithm_type == ALGORITHM_MIN_MAX) + if (quant_tool.scale_file.empty()) { quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); } - else if (quant_tool.algorithm_type == ALGORITHM_KL) + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; + } + case ALGORITHM_KL: + { + if (quant_tool.scale_file.empty()) { quant_tool.scale_file = "table_kl.scale"; + quant_tool.activation_quant_tool(); } - else if (quant_tool.algorithm_type == ALGORITHM_ACIQ) + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; + } + case ALGORITHM_ACIQ: + { + if (quant_tool.scale_file.empty()) { quant_tool.scale_file = "table_aciq.scale"; + quant_tool.activation_quant_tool(); } - else + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; + } + case ALGORITHM_DFQ: + { + quant_tool.data_free_quant(); + quant_tool.model_file = "test_dfq_fp32.tmfile"; + 
if (quant_tool.scale_file.empty()) { - fprintf(stderr, "[Quant Tools Info]: algorithm not specified, using default type MIN MAX\n"); quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); } - - /* quantize activation */ - quant_tool.activation_quant_tool(); + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + break; + } + case ALGORITHM_MM_EQ: + { + if (quant_tool.scale_file.empty()) + { + quant_tool.scale_file = "table_minmax.scale"; + quant_tool.activation_quant_tool(); + } + /* Evaluate quantitative losses */ + if (quant_tool.evaluate) + { + fprintf(stderr, "[Quant Tools Info]: Step Evaluate, evaluate quantitative losses\n"); + quant_tool.assess_quant_loss(0); + } + /* Enable EQ search */ + fprintf(stderr, "[Quant Tools Info]: Step Search, enable EQ search\n"); + quant_tool.quant_search(); + quant_tool.model_file = "save_i8_eq.tmfile"; + save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, true); + break; + } + default: + { + fprintf(stderr, "Unsupported quantization type ... 
\n"); + break; + } } - - /* quantize weight/bias and save into int8 tmfile */ - fprintf(stderr, "[Quant Tools Info]: Calibration file is using %s\n", quant_tool.scale_file.c_str()); - save_graph_i8_perchannel(quant_tool.model_file.c_str(), quant_tool.scale_file.c_str(), quant_tool.output_file, quant_tool.inplace, false); fprintf(stderr, "\n---- Tengine Int8 tmfile create success, best wish for your INT8 inference has a low accuracy loss...\\(^0^)/ ----\n"); diff --git a/tools/quantize/quant_utils.cpp b/tools/quantize/quant_utils.cpp index 0f60d3838..f938f3617 100644 --- a/tools/quantize/quant_utils.cpp +++ b/tools/quantize/quant_utils.cpp @@ -77,7 +77,7 @@ void split(float* array, char* str, const char* del) } void get_input_data_cv(const char* image_file, float* input_data, int img_c, int img_h, int img_w, const float* mean, - const float* scale, int sw_RGB = 0, int center_crop = 0, int letterbox_rows = 0, int letterbox_cols = 0, int focus = 0) + const float* scale, int sw_RGB = 1, int center_crop = 0, int letterbox_rows = 0, int letterbox_cols = 0, int focus = 0) { /* only for yolov5s */ if (focus == 1 && letterbox_rows > 0 && letterbox_cols > 0) @@ -411,6 +411,22 @@ std::vector histCount(float* data, uint32_t elem_num, float max_val, f return hist; } +std::vector histCount(float* data, uint32_t elem_num, float abs_max) +{ + float bin_scale = abs_max / 2047.f; + int bin_zp = 0; + std::vector hist(2048); + for (int i = 0; i < elem_num; i++) + { + if (data[i] != 0) + { + uint32_t hist_idx = round(std::abs(data[i]) / bin_scale); + hist[hist_idx]++; + } + } + return hist; +} + float compute_kl_divergence(std::vector& dist_a, std::vector& dist_b) { const size_t length = dist_a.size(); diff --git a/tools/quantize/quant_utils.hpp b/tools/quantize/quant_utils.hpp index 4ad636763..df529181a 100644 --- a/tools/quantize/quant_utils.hpp +++ b/tools/quantize/quant_utils.hpp @@ -40,6 +40,7 @@ void get_input_data_cv(const char* image_file, float* input_data, int img_c, int void 
readFileList(std::string basePath, std::vector<std::string>& imgs); std::vector<uint32_t> histCount(float* data, uint32_t elem_num, float max_val, float min_val); +std::vector<uint32_t> histCount(float* data, uint32_t elem_num, float abs_max); float compute_kl_divergence(std::vector<float>& dist_a, std::vector<float>& dist_b);