From f1e7bff5292b2ddec5911ee2202f06b2823cde4b Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Fri, 11 Nov 2022 04:00:16 +0000 Subject: [PATCH 01/16] add onnx_ort_runtime demo --- benchmark/benchmark_ppcls.py | 6 +- benchmark/benchmark_ppdet.py | 8 +-- benchmark/benchmark_ppseg.py | 6 +- benchmark/benchmark_yolo.py | 8 +-- examples/runtime/README.md | 4 +- .../runtime/cpp/infer_onnx_onnxruntime.cc | 64 +++++++++++++++++++ .../runtime/cpp/infer_paddle_onnxruntime.cc | 5 ++ .../runtime/python/infer_onnx_onnxruntime.py | 47 ++++++++++++++ 8 files changed, 131 insertions(+), 17 deletions(-) create mode 100644 examples/runtime/cpp/infer_onnx_onnxruntime.cc create mode 100644 examples/runtime/python/infer_onnx_onnxruntime.py diff --git a/benchmark/benchmark_ppcls.py b/benchmark/benchmark_ppcls.py index 039a07cc9e..8eeeb8cfca 100755 --- a/benchmark/benchmark_ppcls.py +++ b/benchmark/benchmark_ppcls.py @@ -16,9 +16,6 @@ import cv2 import os import numpy as np -import pynvml -import psutil -import GPUtil import time @@ -112,6 +109,8 @@ def build_option(args): def get_current_memory_mb(gpu_id=None): + import pynvml + import psutil pid = os.getpid() p = psutil.Process(pid) info = p.memory_full_info() @@ -126,6 +125,7 @@ def get_current_memory_mb(gpu_id=None): def get_current_gputil(gpu_id): + import GPUtil GPUs = GPUtil.getGPUs() gpu_load = GPUs[gpu_id].load return gpu_load diff --git a/benchmark/benchmark_ppdet.py b/benchmark/benchmark_ppdet.py index 6cabc4d4e9..6d08aafb8a 100755 --- a/benchmark/benchmark_ppdet.py +++ b/benchmark/benchmark_ppdet.py @@ -16,11 +16,6 @@ import cv2 import os import numpy as np -import datetime -import json -import pynvml -import psutil -import GPUtil import time @@ -114,6 +109,8 @@ def build_option(args): def get_current_memory_mb(gpu_id=None): + import pynvml + import psutil pid = os.getpid() p = psutil.Process(pid) info = p.memory_full_info() @@ -128,6 +125,7 @@ def get_current_memory_mb(gpu_id=None): def get_current_gputil(gpu_id): + import GPUtil GPUs = GPUtil.getGPUs() gpu_load = GPUs[gpu_id].load return gpu_load diff --git a/benchmark/benchmark_ppseg.py b/benchmark/benchmark_ppseg.py index ef57e37150..7d9df9f077 100755 --- a/benchmark/benchmark_ppseg.py +++ b/benchmark/benchmark_ppseg.py @@ -16,9 +16,6 @@ import cv2 import os import numpy as np -import pynvml -import psutil -import GPUtil import time @@ -112,6 +109,8 @@ def build_option(args): def get_current_memory_mb(gpu_id=None): + import pynvml + import psutil pid = os.getpid() p = psutil.Process(pid) info = p.memory_full_info() @@ -126,6 +125,7 @@ def get_current_memory_mb(gpu_id=None): def get_current_gputil(gpu_id): + import GPUtil GPUs = GPUtil.getGPUs() gpu_load = GPUs[gpu_id].load return gpu_load diff --git a/benchmark/benchmark_yolo.py b/benchmark/benchmark_yolo.py index aa6927c833..dd63cefb65 100755 --- a/benchmark/benchmark_yolo.py +++ b/benchmark/benchmark_yolo.py @@ -16,11 +16,6 @@ import cv2 import os import numpy as np -import datetime -import json -import pynvml -import psutil -import GPUtil import time @@ -114,6 +109,8 @@ def build_option(args): def get_current_memory_mb(gpu_id=None): + import pynvml + import psutil pid = os.getpid() p = psutil.Process(pid) info = p.memory_full_info() @@ -128,6 +125,7 @@ def get_current_memory_mb(gpu_id=None): def get_current_gputil(gpu_id): + import GPUtil GPUs = GPUtil.getGPUs() gpu_load = GPUs[gpu_id].load return gpu_load diff --git a/examples/runtime/README.md b/examples/runtime/README.md index b434bc99eb..2f739b8609 100755 --- a/examples/runtime/README.md 
+++ b/examples/runtime/README.md
@@ -1,6 +1,6 @@
 # FastDeploy Runtime examples

-FastDeploy Runtime C++ 推理示例如下
+FastDeploy Runtime 推理示例如下

 ## Python 示例

@@ -12,6 +12,7 @@ FastDeploy Runtime C++ 推理示例如下
 | python/infer_paddle_onnxruntime.py | Python | Deploy Paddle model with ONNX Runtime(CPU/GPU) |
 | python/infer_onnx_openvino.py | Python | Deploy ONNX model with OpenVINO(CPU) |
 | python/infer_onnx_tensorrt.py | Python | Deploy ONNX model with TensorRT(GPU) |
+| python/infer_onnx_onnxruntime.py | Python | Deploy ONNX model with ONNX Runtime(CPU/GPU) |

 ## C++ 示例

@@ -23,6 +24,7 @@ FastDeploy Runtime C++ 推理示例如下
 | cpp/infer_paddle_onnxruntime.cc | C++ | Deploy Paddle model with ONNX Runtime(CPU/GPU) |
 | cpp/infer_onnx_openvino.cc | C++ | Deploy ONNX model with OpenVINO(CPU) |
 | cpp/infer_onnx_tensorrt.cc | C++ | Deploy ONNX model with TensorRT(GPU) |
+| cpp/infer_onnx_onnxruntime.cc | C++ | Deploy ONNX model with ONNX Runtime(CPU/GPU) |

 ## 详细部署文档

diff --git a/examples/runtime/cpp/infer_onnx_onnxruntime.cc b/examples/runtime/cpp/infer_onnx_onnxruntime.cc
new file mode 100644
index 0000000000..4c27c1f65f
--- /dev/null
+++ b/examples/runtime/cpp/infer_onnx_onnxruntime.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/runtime.h"
+
+namespace fd = fastdeploy;
+
+int main(int argc, char* argv[]) {
+  std::string model_file = "mobilenetv2.onnx";
+
+  // setup option
+  fd::RuntimeOption runtime_option;
+  runtime_option.SetModelPath(model_file, "", fd::ModelFormat::ONNX);
+  runtime_option.UseOrtBackend();
+  runtime_option.SetCpuThreadNum(12);
+
+  // **** GPU ****
+  // To use GPU, use the following commented code
+  // runtime_option.UseGpu(0);
+
+  // init runtime
+  std::unique_ptr<fd::Runtime> runtime =
+      std::unique_ptr<fd::Runtime>(new fd::Runtime());
+  if (!runtime->Init(runtime_option)) {
+    std::cerr << "--- Init FastDeploy Runtime Failed! "
+              << "\n--- Model: " << model_file << std::endl;
+    return -1;
+  } else {
+    std::cout << "--- Init FastDeploy Runtime Done! "
+              << "\n--- Model: " << model_file << std::endl;
+  }
+  // init input tensor shape
+  fd::TensorInfo info = runtime->GetInputInfo(0);
+  info.shape = {1, 3, 224, 224};
+
+  std::vector<fd::FDTensor> input_tensors(1);
+  std::vector<fd::FDTensor> output_tensors(1);
+
+  std::vector<float> inputs_data;
+  inputs_data.resize(1 * 3 * 224 * 224);
+  for (size_t i = 0; i < inputs_data.size(); ++i) {
+    inputs_data[i] = std::rand() % 1000 / 1000.0f;
+  }
+  input_tensors[0].SetExternalData({1, 3, 224, 224}, fd::FDDataType::FP32, inputs_data.data());
+
+  // get input name
+  input_tensors[0].name = info.name;
+
+  runtime->Infer(input_tensors, &output_tensors);
+
+  output_tensors[0].PrintInfo();
+  return 0;
+}
\ No newline at end of file
diff --git a/examples/runtime/cpp/infer_paddle_onnxruntime.cc b/examples/runtime/cpp/infer_paddle_onnxruntime.cc
index d8d036a034..612966d736 100644
--- a/examples/runtime/cpp/infer_paddle_onnxruntime.cc
+++ b/examples/runtime/cpp/infer_paddle_onnxruntime.cc
@@ -25,6 +25,11 @@ int main(int argc, char* argv[]) {
   runtime_option.SetModelPath(model_file, params_file, fd::ModelFormat::PADDLE);
   runtime_option.UseOrtBackend();
   runtime_option.SetCpuThreadNum(12);
+
+  // **** GPU ****
+  // To use GPU, use the following commented code
+  // runtime_option.UseGpu(0);
+
   // init runtime
   std::unique_ptr<fd::Runtime> runtime =
       std::unique_ptr<fd::Runtime>(new fd::Runtime());
diff --git a/examples/runtime/python/infer_onnx_onnxruntime.py b/examples/runtime/python/infer_onnx_onnxruntime.py
new file mode 100644
index 0000000000..ccb3ce5ec4
--- /dev/null
+++ b/examples/runtime/python/infer_onnx_onnxruntime.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import fastdeploy as fd +from fastdeploy import ModelFormat +import numpy as np + +# 下载模型并解压 +model_url = "https://bj.bcebos.com/fastdeploy/models/mobilenetv2.onnx" +fd.download(model_url, path=".") + +option = fd.RuntimeOption() + +option.set_model_path("mobilenetv2.onnx", model_format=ModelFormat.ONNX) + +# **** CPU 配置 **** +option.use_cpu() +option.use_ort_backend() +option.set_cpu_thread_num(12) + +# **** GPU 配置 **** +# 如需使用GPU,使用如下注释代码 +# option.use_gpu(0) + +# 初始化构造runtime +runtime = fd.Runtime(option) + +# 获取模型输入名 +input_name = runtime.get_input_info(0).name + +# 构造随机数据进行推理 +results = runtime.infer({ + input_name: np.random.rand(1, 3, 224, 224).astype("float32") +}) + +print(results[0].shape) From b5dff8df7e8d2deb050fd0a8916b865cd71ecd5a Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Fri, 11 Nov 2022 04:13:15 +0000 Subject: [PATCH 02/16] rm in requirements --- benchmark/requirements.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt index 9d78d39fed..24ce15ab7e 100644 --- a/benchmark/requirements.txt +++ b/benchmark/requirements.txt @@ -1,4 +1 @@ numpy -pynvml -psutil -GPUtil From 4e1f35a4b5e664d727424e51cfca74ea31f47b83 Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Mon, 14 Nov 2022 11:59:07 +0000 Subject: [PATCH 03/16] support batch eval --- .../fastdeploy/vision/evaluation/detection.py | 53 ++++++++++++++----- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/python/fastdeploy/vision/evaluation/detection.py b/python/fastdeploy/vision/evaluation/detection.py index 98c6794fea..a13e0429e0 100644 --- a/python/fastdeploy/vision/evaluation/detection.py +++ b/python/fastdeploy/vision/evaluation/detection.py @@ -23,7 +23,8 @@ def eval_detection(model, ann_file, conf_threshold=None, nms_iou_threshold=None, - plot=False): + plot=False, + batch_size=1): from .utils import CocoDetection from .utils import COCOMetric import cv2 @@ -54,6 +55,8 @@ def eval_detection(model, start_time = 0 end_time = 0 average_inference_time = 0 + im_list = list() + im_id_list = list() for image_info, i in zip(all_image_info, trange( image_num, desc="Inference Progress")): @@ -61,19 +64,43 @@ def eval_detection(model, start_time = time.time() im = cv2.imread(image_info["image"]) im_id = image_info["im_id"] - if conf_threshold is None and nms_iou_threshold is None: - result = model.predict(im.copy()) + if batch_size == 1: + if conf_threshold is None and nms_iou_threshold is None: + result = model.predict(im.copy()) + else: + result = model.predict(im, conf_threshold, nms_iou_threshold) + pred = { + 'bbox': [[c] + [s] + b + for b, s, c in zip(result.boxes, result.scores, + result.label_ids)], + 'bbox_num': len(result.boxes), + 'im_id': im_id + } + eval_metric.update(im_id, pred) else: - result = model.predict(im, conf_threshold, nms_iou_threshold) - pred = { - 'bbox': - [[c] + [s] + b - for b, s, c in zip(result.boxes, result.scores, result.label_ids) - ], - 'bbox_num': len(result.boxes), - 'im_id': im_id - } - eval_metric.update(im_id, pred) + im_list.append(im) + im_id_list.append(im_id) + # If the batch_size is not satisfied, the remaining pictures are formed into a batch + if (i + 1) % batch_size != 0 and i != image_num - 1: + continue + if conf_threshold is None and nms_iou_threshold is None: + results = model.batch_predict(im_list) + else: + model.postprocessor.conf_threshold = conf_threshold + model.postprocessor.nms_threshold = nms_iou_threshold + results = model.batch_predict(im_list) + for k in range(len(im_list)): + pred = { 
+ 'bbox': [[c] + [s] + b + for b, s, c in zip(results[k].boxes, results[ + k].scores, results[k].label_ids)], + 'bbox_num': len(results[k].boxes), + 'im_id': im_id_list[k] + } + eval_metric.update(im_id_list[k], pred) + im_list.clear() + im_id_list.clear() + if i == image_num - 1: end_time = time.time() average_inference_time = round( From 99c610c4924a4af205ea139be022fda5471d8e3e Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Tue, 15 Nov 2022 07:36:38 +0000 Subject: [PATCH 04/16] fixed MattingResults bug --- fastdeploy/vision/common/result.cc | 4 ++-- tests/models/test_rvm.py | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fastdeploy/vision/common/result.cc b/fastdeploy/vision/common/result.cc index 3585713a67..4d746797f4 100755 --- a/fastdeploy/vision/common/result.cc +++ b/fastdeploy/vision/common/result.cc @@ -372,7 +372,7 @@ void MattingResult::Reserve(int size) { if (contain_foreground) { FDASSERT((shape.size() == 3), "Please initial shape (h,w,c) before call Reserve."); - int c = static_cast(shape[3]); + int c = static_cast(shape[2]); foreground.reserve(size * c); } } @@ -382,7 +382,7 @@ void MattingResult::Resize(int size) { if (contain_foreground) { FDASSERT((shape.size() == 3), "Please initial shape (h,w,c) before call Resize."); - int c = static_cast(shape[3]); + int c = static_cast(shape[2]); foreground.resize(size * c); } } diff --git a/tests/models/test_rvm.py b/tests/models/test_rvm.py index 10d680948f..4fa3083e59 100644 --- a/tests/models/test_rvm.py +++ b/tests/models/test_rvm.py @@ -19,6 +19,7 @@ import numpy as np import runtime_config as rc + def test_matting_rvm_cpu(): model_url = "https://bj.bcebos.com/paddlehub/fastdeploy/rvm.tgz" input_url = "https://bj.bcebos.com/paddlehub/fastdeploy/video.mp4" @@ -38,7 +39,8 @@ def test_matting_rvm_cpu(): break result = model.predict(frame) # compare diff - expect_alpha = np.load("resources/rvm/result_alpha_" + str(frame_id) + ".npy") + expect_alpha = np.load("resources/rvm/result_alpha_" + str(frame_id) + + ".npy") result_alpha = np.array(result.alpha).reshape(1920, 1080) diff = np.fabs(expect_alpha - result_alpha) thres = 1e-05 @@ -51,3 +53,7 @@ def test_matting_rvm_cpu(): cap.release() cv2.destroyAllWindows() break + + +if __name__ == "__main__": + test_matting_rvm_cpu() From c59fd5c5e2dfa01b6553138044357405e77eb8c0 Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Tue, 15 Nov 2022 12:32:26 +0000 Subject: [PATCH 05/16] move assignment for DetectionResult --- fastdeploy/vision/common/result.cc | 14 ++++++++++++++ fastdeploy/vision/common/result.h | 5 ++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fastdeploy/vision/common/result.cc b/fastdeploy/vision/common/result.cc index 4d746797f4..249c8e7452 100755 --- a/fastdeploy/vision/common/result.cc +++ b/fastdeploy/vision/common/result.cc @@ -80,6 +80,20 @@ DetectionResult::DetectionResult(const DetectionResult& res) { } } +DetectionResult& DetectionResult::operator=(DetectionResult&& other) { + if (&other != this) { + boxes = std::move(other.boxes); + scores = std::move(other.scores); + label_ids = std::move(other.label_ids); + contain_masks = std::move(other.contain_masks); + if (contain_masks) { + masks.clear(); + masks = std::move(other.masks); + } + } + return *this; +} + void DetectionResult::Clear() { std::vector>().swap(boxes); std::vector().swap(scores); diff --git a/fastdeploy/vision/common/result.h b/fastdeploy/vision/common/result.h index 59690ab6d8..27f6a97709 100755 --- a/fastdeploy/vision/common/result.h +++ 
b/fastdeploy/vision/common/result.h @@ -95,6 +95,7 @@ struct FASTDEPLOY_DECL Mask : public BaseResult { /*! @brief Detection result structure for all the object detection models and instance segmentation models */ struct FASTDEPLOY_DECL DetectionResult : public BaseResult { + DetectionResult() = default; /** \brief All the detected object boxes for an input image, the size of `boxes` is the number of detected objects, and the element of `boxes` is a array of 4 float values, means [xmin, ymin, xmax, ymax] */ std::vector> boxes; @@ -111,8 +112,10 @@ struct FASTDEPLOY_DECL DetectionResult : public BaseResult { ResultType type = ResultType::DETECTION; - DetectionResult() {} + /// Copy constructor DetectionResult(const DetectionResult& res); + /// Move assignment + DetectionResult& operator=(DetectionResult&& other); /// Clear detection result void Clear(); From 0371811ed947a545793fdfcef8fbe4ef23bb3945 Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Fri, 18 Nov 2022 04:00:34 +0000 Subject: [PATCH 06/16] integrated x2paddle --- tools/common_tools/common_tools.py | 93 +++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 8 deletions(-) mode change 100644 => 100755 tools/common_tools/common_tools.py diff --git a/tools/common_tools/common_tools.py b/tools/common_tools/common_tools.py old mode 100644 new mode 100755 index 04fa215f72..06d704481a --- a/tools/common_tools/common_tools.py +++ b/tools/common_tools/common_tools.py @@ -1,8 +1,8 @@ import argparse +import ast def argsparser(): - parser = argparse.ArgumentParser(description=__doc__) ## argumentments for auto compression parser.add_argument('--auto_compress', default=False, action='store_true') @@ -10,31 +10,71 @@ def argsparser(): '--config_path', type=str, default=None, - help="path of compression strategy config.", - required=True) + help="path of compression strategy config.") parser.add_argument( '--method', type=str, default=None, - help="choose PTQ or QAT as quantization method", - required=True) + help="choose PTQ or QAT as quantization method") parser.add_argument( '--save_dir', type=str, default='./output', - help="directory to save compressed model.") + help="directory to save model.") parser.add_argument( '--devices', type=str, default='gpu', help="which device used to compress.") - + ## arguments for other x2paddle + parser.add_argument('--convert', default=False, action='store_true') + parser.add_argument( + '--framework', + type=str, + default=None, + help="define which deeplearning framework(tensorflow/caffe/onnx)") + parser.add_argument( + '--model', + type=str, + default=None, + help="define model file path for tensorflow or onnx") + parser.add_argument( + "--prototxt", + "-p", + type=str, + default=None, + help="prototxt file of caffe model") + parser.add_argument( + "--weight", + "-w", + type=str, + default=None, + help="weight file of caffe model") + parser.add_argument( + "--caffe_proto", + "-c", + type=str, + default=None, + help="optional: the .py file compiled by caffe proto file of caffe model" + ) + parser.add_argument( + "--input_shape_dict", + "-isd", + type=str, + default=None, + help="define input shapes, e.g --input_shape_dict=\"{'image':[1, 3, 608, 608]}\" or" \ + "--input_shape_dict=\"{'image':[1, 3, 608, 608], 'im_shape': [1, 2], 'scale_factor': [1, 2]}\"") + parser.add_argument( + "--enable_code_optim", + "-co", + type=ast.literal_eval, + default=False, + help="Turn on code optimization") ## arguments for other tools return parser def main(): - args = argsparser().parse_args() if 
args.auto_compress == True: try: @@ -45,6 +85,43 @@ def main(): print( "Can not start auto compresssion successfully! Please check if you have installed it!" ) + if args.convert == True: + try: + import platform + import logging + v0, v1, v2 = platform.python_version().split('.') + if not (int(v0) >= 3 and int(v1) >= 5): + logging.info("[ERROR] python>=3.5 is required") + return + import paddle + v0, v1, v2 = paddle.__version__.split('.') + logging.info("paddle.__version__ = {}".format(paddle.__version__)) + if v0 == '0' and v1 == '0' and v2 == '0': + logging.info( + "[WARNING] You are use develop version of paddlepaddle") + elif int(v0) != 2 or int(v1) < 0: + logging.info("[ERROR] paddlepaddle>=2.0.0 is required") + return + from x2paddle.convert import tf2paddle, caffe2paddle, onnx2paddle + if args.framework == "tensorflow": + assert args.model is not None, "--model should be defined while convert tensorflow model" + tf2paddle(args.model, args.save_dir) + elif args.framework == "caffe": + assert args.prototxt is not None and args.weight is not None, "--prototxt and --weight should be defined while convert caffe model" + caffe2paddle(args.prototxt, args.weight, args.save_dir, + args.caffe_proto) + elif args.framework == "onnx": + assert args.model is not None, "--model should be defined while convert onnx model" + onnx2paddle( + args.model, + args.save_dir, + input_shape_dict=args.input_shape_dict) + else: + raise Exception( + "--framework only support tensorflow/caffe/onnx now") + except ImportError: + print( + "Model convert failed! Please check if you have installed it!") if __name__ == '__main__': From cb9c9666b77336618cb7df53fb6044d3f6b2bc51 Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Sun, 20 Nov 2022 03:51:25 +0000 Subject: [PATCH 07/16] add model convert readme --- .../download_prebuilt_libraries.md | 8 ++- .../download_prebuilt_libraries.md | 8 ++- tools/README.md | 49 ++++++++++++++++++- tools/README_EN.md | 48 +++++++++++++++++- 4 files changed, 105 insertions(+), 8 deletions(-) mode change 100644 => 100755 tools/README.md mode change 100644 => 100755 tools/README_EN.md diff --git a/docs/cn/build_and_install/download_prebuilt_libraries.md b/docs/cn/build_and_install/download_prebuilt_libraries.md index f296363e6b..105e0f0e71 100755 --- a/docs/cn/build_and_install/download_prebuilt_libraries.md +++ b/docs/cn/build_and_install/download_prebuilt_libraries.md @@ -3,8 +3,10 @@ FastDeploy提供各平台预编译库,供开发者直接下载安装使用。当然FastDeploy编译也非常容易,开发者也可根据自身需求编译FastDeploy。 本文分为两部分: -- [1.GPU部署环境](##GPU部署环境) -- [2.CPU部署环境](##CPU部署环境) +- [1.GPU部署环境](#1) +- [2.CPU部署环境](#2) + +
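The `--convert` branch added to `common_tools.py` above ultimately dispatches to X2Paddle's Python API. As a minimal sketch (assuming `x2paddle` is installed and `model.onnx` is a local ONNX file; both names are placeholders), the ONNX path boils down to:

```python
from x2paddle.convert import onnx2paddle

# Convert an ONNX model into a Paddle inference model under ./pd_model,
# which is what `fastdeploy --convert --framework onnx` calls internally.
onnx2paddle("model.onnx", "pd_model")
```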

## GPU部署环境 @@ -49,6 +51,8 @@ Develop版本(Nightly build) | Linux x64 | [fastdeploy-linux-x64-gpu-0.0.0.tgz](https://fastdeploy.bj.bcebos.com/dev/cpp/fastdeploy-linux-x64-gpu-0.0.0.tgz) | g++ 8.2, CUDA 11.2, cuDNN 8.2编译产出 | | Windows x64 | [fastdeploy-win-x64-gpu-0.0.0.zip](https://fastdeploy.bj.bcebos.com/dev/cpp/fastdeploy-win-x64-gpu-0.0.0.zip) | Visual Studio 16 2019, CUDA 11.2, cuDNN 8.2编译产出 | +

+ ## CPU部署环境 ### 环境要求 diff --git a/docs/en/build_and_install/download_prebuilt_libraries.md b/docs/en/build_and_install/download_prebuilt_libraries.md index 565d39d93d..f7f161dfe6 100755 --- a/docs/en/build_and_install/download_prebuilt_libraries.md +++ b/docs/en/build_and_install/download_prebuilt_libraries.md @@ -3,8 +3,10 @@ FastDeploy provides pre-built libraries for developers to download and install directly. Meanwhile, FastDeploy also offers easy access to compile so that developers can compile FastDeploy according to their own needs. This article is divided into two parts: -- [1.GPU Deployment Environment](#gpu-deployment-environment) -- [2.CPU Deployment Environment](#cpu-deployment-environment) +- [1.GPU Deployment Environment](#1) +- [2.CPU Deployment Environment](#2) + +

## GPU Deployment Environment @@ -53,6 +55,8 @@ Install the Develop version(Nightly build) | Linux x64 | [fastdeploy-linux-x64-gpu-0.0.0.tgz](https://fastdeploy.bj.bcebos.com/dev/cpp/fastdeploy-linux-x64-gpu-0.0.0.tgz) | g++ 8.2, CUDA 11.2, cuDNN 8.2 | | Windows x64 | [fastdeploy-win-x64-gpu-0.0.0.zip](https://fastdeploy.bj.bcebos.com/dev/cpp/fastdeploy-win-x64-gpu-0.0.0.zip) | Visual Studio 16 2019, CUDA 11.2, cuDNN 8.2 | +

+ ## CPU Deployment Environment ### Environment Requirement diff --git a/tools/README.md b/tools/README.md old mode 100644 new mode 100755 index c166b8f9f0..c0009355c8 --- a/tools/README.md +++ b/tools/README.md @@ -1,8 +1,14 @@ # FastDeploy 工具包 FastDeploy提供了一系列高效易用的工具优化部署体验, 提升推理性能. -例如, FastDeploy基于PaddleSlim的Auto Compression Toolkit(ACT), 给用户提供了一键模型自动化压缩的工具, 用户可以轻松地通过一行命令对模型进行自动化压缩, 并在FastDeploy上部署压缩后的模型, 提升推理速度. 本文档将以FastDeploy一键模型自动化压缩工具为例, 介绍如何安装此工具, 并提供相应的使用文档. -## FastDeploy一键模型自动化压缩工具 +- [1.自动压缩工具包](#1) +- [2.模型转换工具包](#2) + +

+ +## 一键模型自动化压缩工具 + +FastDeploy基于PaddleSlim的Auto Compression Toolkit(ACT), 给用户提供了一键模型自动化压缩的工具, 用户可以轻松地通过一行命令对模型进行自动化压缩, 并在FastDeploy上部署压缩后的模型, 提升推理速度. 本文档将以FastDeploy一键模型自动化压缩工具为例, 介绍如何安装此工具, 并提供相应的使用文档. ### 环境准备 1.用户参考PaddlePaddle官网, 安装develop版本 @@ -33,3 +39,42 @@ python setup.py install fastdeploy --auto_compress --config_path=./configs/detection/yolov5s_quant.yaml --method='PTQ' --save_dir='./yolov5s_ptq_model/' ``` 详细使用文档请参考[FastDeploy一键模型自动化压缩工具](./auto_compression/README.md) + +

+ +## 模型转换工具 + +FastDeploy 基于 X2Paddle 为用户提供了模型转换的工具, 用户可以轻松地通过一行命令将外部框架模型快速迁移至飞桨框架,目前支持 ONNX、TensorFlow 以及 Caffe,支持大部分主流的CV和NLP的模型转换。 + +### 环境准备 + +1. PaddlePaddle 安装 + +参考如下文档快速安装 +``` +https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html +``` + +2. X2Paddle 安装 + +如需使用稳定版本,可通过pip方式安装X2Paddle: +```shell +pip install x2paddle +``` + +如需体验最新功能,可使用源码安装方式: +```shell +git clone https://github.com/PaddlePaddle/X2Paddle.git +cd X2Paddle +python setup.py install +``` + +### 使用方式 + +按照以上步骤成功安装后,即可使用 FastDeploy 一键转换工具, 示例如下: + +```bash +fastdeploy --convert --framework onnx --model yolov5s.onnx --save_dir pd_model +``` + +更多详细内容可参考[X2Paddle](https://github.com/PaddlePaddle/X2Paddle) diff --git a/tools/README_EN.md b/tools/README_EN.md old mode 100644 new mode 100755 index 3a8313fa9f..fd021619ac --- a/tools/README_EN.md +++ b/tools/README_EN.md @@ -1,9 +1,14 @@ # FastDeploy Toolkit FastDeploy provides a series of efficient and easy-to-use tools to optimize the deployment experience and improve inference performance. -For example, based on PaddleSlim's Auto Compression Toolkit (ACT), FastDeploy provides users with a one-click model automation compression tool that allows users to easily compress the model with a single command. This document will take FastDeploy's one-click model automation compression tool as an example, introduce how to install the tool, and provide the corresponding documentation for usage. +- [1.Auto Compression Tool](#1) +- [2.Model Conversion Tool](#2) -## FastDeploy One-Click Model Auto Compression Tool +

+ +## One-Click Model Auto Compression Tool + +Based on PaddleSlim's Auto Compression Toolkit (ACT), FastDeploy provides users with a one-click model automation compression tool that allows users to easily compress the model with a single command. This document will take FastDeploy's one-click model automation compression tool as an example, introduce how to install the tool, and provide the corresponding documentation for usage. ### Environmental Preparation 1.Install PaddlePaddle develop version @@ -33,3 +38,42 @@ After the above steps are successfully installed, you can use FastDeploy one-cli fastdeploy --auto_compress --config_path=./configs/detection/yolov5s_quant.yaml --method='PTQ' --save_dir='./yolov5s_ptq_model/' ``` For detailed documentation, please refer to [FastDeploy One-Click Model Auto Compression Tool](./auto_compression/README.md) + +

+ +## Model Conversion Tool + +Based on X2Paddle, FastDeploy provides users with a model conversion tool. Users can easily migrate external framework models to the Paddle framework with one line of commands. Currently, ONNX, TensorFlow and Caffe are supported, and most mainstream CV and NLP model conversions are supported. + +### Environmental Preparation + +1. Install PaddlePaddle + +Refer to the following documents for quick installation +``` +https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html +``` + +2. Install X2Paddle + +To use the stable version, install X2Paddle via pip: +```shell +pip install x2paddle +``` + +To experience the latest features, you can use the source installation method: +```shell +git clone https://github.com/PaddlePaddle/X2Paddle.git +cd X2Paddle +python setup.py install +``` + +### How to use + +After successful installation according to the above steps, you can use the FastDeploy one-click conversion tool. The example is as follows: + +```bash +fastdeploy --convert --framework onnx --model yolov5s.onnx --save_dir pd_model +``` + +For more details, please refer to[X2Paddle](https://github.com/PaddlePaddle/X2Paddle) From 670cefb86f20cbf4fc6f07392c48e5c3ec58fb97 Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Sun, 20 Nov 2022 03:56:57 +0000 Subject: [PATCH 08/16] update readme --- tools/README.md | 4 +--- tools/README_EN.md | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/tools/README.md b/tools/README.md index c0009355c8..7a1a36853b 100755 --- a/tools/README.md +++ b/tools/README.md @@ -48,9 +48,7 @@ FastDeploy 基于 X2Paddle 为用户提供了模型转换的工具, 用户可以 ### 环境准备 -1. PaddlePaddle 安装 - -参考如下文档快速安装 +1. PaddlePaddle 安装,可参考如下文档快速安装 ``` https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html ``` diff --git a/tools/README_EN.md b/tools/README_EN.md index fd021619ac..26305ac914 100755 --- a/tools/README_EN.md +++ b/tools/README_EN.md @@ -47,9 +47,7 @@ Based on X2Paddle, FastDeploy provides users with a model conversion tool. Users ### Environmental Preparation -1. Install PaddlePaddle - -Refer to the following documents for quick installation +1. Install PaddlePaddle, refer to the following documents for quick installation ``` https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html ``` From 6e9e6a511a13d600889595ccb57bb4c27b2c31f8 Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Sun, 20 Nov 2022 03:59:16 +0000 Subject: [PATCH 09/16] re-lint From b0f45f7e4062fb27e92dac044913741b2850ddcb Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Tue, 22 Nov 2022 11:54:22 +0000 Subject: [PATCH 10/16] add processor api --- docs/api_docs/python/image_classification.md | 16 +++++++ docs/api_docs/python/object_detection.md | 48 ++++++++++++++++++++ 2 files changed, 64 insertions(+) mode change 100644 => 100755 docs/api_docs/python/object_detection.md diff --git a/docs/api_docs/python/image_classification.md b/docs/api_docs/python/image_classification.md index 6b18f5c09f..46fc09a50e 100755 --- a/docs/api_docs/python/image_classification.md +++ b/docs/api_docs/python/image_classification.md @@ -1,5 +1,21 @@ # Image Classification(图像分类) +## fastdeploy.vision.classification.PaddleClasPreprocessor + +```{eval-rst} +.. autoclass:: fastdeploy.vision.classification.PaddleClasPreprocessor + :members: + :inherited-members: +``` + +## fastdeploy.vision.classification.PaddleClasPostprocessor + +```{eval-rst} +.. 
autoclass:: fastdeploy.vision.classification.PaddleClasPostprocessor + :members: + :inherited-members: +``` + ## fastdeploy.vision.classification.PaddleClasModel ```{eval-rst} diff --git a/docs/api_docs/python/object_detection.md b/docs/api_docs/python/object_detection.md old mode 100644 new mode 100755 index efe7b5923f..998ae13964 --- a/docs/api_docs/python/object_detection.md +++ b/docs/api_docs/python/object_detection.md @@ -1,5 +1,21 @@ # Object Detection(目标检测) +## fastdeploy.vision.detection.PaddleDetPreprocessor + +```{eval-rst} +.. autoclass:: fastdeploy.vision.detection.PaddleDetPreprocessor + :members: + :inherited-members: +``` + +## fastdeploy.vision.detection.PaddleDetPostprocessor + +```{eval-rst} +.. autoclass:: fastdeploy.vision.detection.PaddleDetPostprocessor + :members: + :inherited-members: +``` + ## fastdeploy.vision.detection.PPYOLOE ```{eval-rst} @@ -80,6 +96,22 @@ :inherited-members: ``` +## fastdeploy.vision.detection.YOLOv5Preprocessor + +```{eval-rst} +.. autoclass:: fastdeploy.vision.detection.YOLOv5Preprocessor + :members: + :inherited-members: +``` + +## fastdeploy.vision.detection.YOLOv5Postprocessor + +```{eval-rst} +.. autoclass:: fastdeploy.vision.detection.YOLOv5Postprocessor + :members: + :inherited-members: +``` + ## fastdeploy.vision.detection.YOLOv5 ```{eval-rst} @@ -104,6 +136,22 @@ :inherited-members: ``` +## fastdeploy.vision.detection.YOLOv7Preprocessor + +```{eval-rst} +.. autoclass:: fastdeploy.vision.detection.YOLOv7Preprocessor + :members: + :inherited-members: +``` + +## fastdeploy.vision.detection.YOLOv7Postprocessor + +```{eval-rst} +.. autoclass:: fastdeploy.vision.detection.YOLOv7Postprocessor + :members: + :inherited-members: +``` + ## fastdeploy.vision.detection.YOLOv7 ```{eval-rst} From b85cc13e716ade815a1e19c0c3b956fc12338423 Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Fri, 25 Nov 2022 09:25:21 +0000 Subject: [PATCH 11/16] Add MattingResult Free --- fastdeploy/vision/common/result.cc | 7 +++++++ fastdeploy/vision/common/result.h | 5 ++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fastdeploy/vision/common/result.cc b/fastdeploy/vision/common/result.cc index ee13760489..0211720bed 100755 --- a/fastdeploy/vision/common/result.cc +++ b/fastdeploy/vision/common/result.cc @@ -378,6 +378,13 @@ MattingResult::MattingResult(const MattingResult& res) { } void MattingResult::Clear() { + alpha.clear(); + foreground.clear(); + shape.clear(); + contain_foreground = false; +} + +void MattingResult::Free() { std::vector().swap(alpha); std::vector().swap(foreground); std::vector().swap(shape); diff --git a/fastdeploy/vision/common/result.h b/fastdeploy/vision/common/result.h index 27f6a97709..2fd3d72dd9 100755 --- a/fastdeploy/vision/common/result.h +++ b/fastdeploy/vision/common/result.h @@ -316,9 +316,12 @@ struct FASTDEPLOY_DECL MattingResult : public BaseResult { MattingResult() {} MattingResult(const MattingResult& res); - /// Clear detection result + /// Clear matting result void Clear(); + /// Free matting result + void Free(); + void Reserve(int size); void Resize(int size); From 52180e7d400772be2a958703b2a7866928fa08eb Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Mon, 28 Nov 2022 09:02:21 +0000 Subject: [PATCH 12/16] change valid_cpu_backends order --- fastdeploy/vision/classification/ppcls/model.cc | 2 +- fastdeploy/vision/matting/contrib/rvm.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) mode change 100644 => 100755 fastdeploy/vision/classification/ppcls/model.cc diff --git 
a/fastdeploy/vision/classification/ppcls/model.cc b/fastdeploy/vision/classification/ppcls/model.cc old mode 100644 new mode 100755 index 935336a26f..ee87e88824 --- a/fastdeploy/vision/classification/ppcls/model.cc +++ b/fastdeploy/vision/classification/ppcls/model.cc @@ -24,7 +24,7 @@ PaddleClasModel::PaddleClasModel(const std::string& model_file, const RuntimeOption& custom_option, const ModelFormat& model_format) : preprocessor_(config_file) { if (model_format == ModelFormat::PADDLE) { - valid_cpu_backends = {Backend::ORT, Backend::OPENVINO, Backend::PDINFER, + valid_cpu_backends = {Backend::OPENVINO, Backend::PDINFER, Backend::ORT, Backend::LITE}; valid_gpu_backends = {Backend::ORT, Backend::PDINFER, Backend::TRT}; valid_timvx_backends = {Backend::LITE}; diff --git a/fastdeploy/vision/matting/contrib/rvm.cc b/fastdeploy/vision/matting/contrib/rvm.cc index ea37402b04..258205cf89 100755 --- a/fastdeploy/vision/matting/contrib/rvm.cc +++ b/fastdeploy/vision/matting/contrib/rvm.cc @@ -28,7 +28,7 @@ RobustVideoMatting::RobustVideoMatting(const std::string& model_file, const RuntimeOption& custom_option, const ModelFormat& model_format) { if (model_format == ModelFormat::ONNX) { - valid_cpu_backends = {Backend::OPENVINO, Backend::ORT}; + valid_cpu_backends = {Backend::ORT, Backend::OPENVINO}; valid_gpu_backends = {Backend::ORT, Backend::TRT}; } else { valid_cpu_backends = {Backend::PDINFER, Backend::ORT}; From 43c9567746d803499320393a616570a7acef5620 Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Thu, 1 Dec 2022 13:16:43 +0000 Subject: [PATCH 13/16] add ppocr benchmark --- benchmark/benchmark_ppocr.py | 279 +++++++++++++++++++++++++++++++ benchmark/run_benchmark_ppocr.sh | 23 +++ 2 files changed, 302 insertions(+) create mode 100644 benchmark/benchmark_ppocr.py create mode 100644 benchmark/run_benchmark_ppocr.sh diff --git a/benchmark/benchmark_ppocr.py b/benchmark/benchmark_ppocr.py new file mode 100644 index 0000000000..edcbb34d92 --- /dev/null +++ b/benchmark/benchmark_ppocr.py @@ -0,0 +1,279 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
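+
+# Benchmarks a PP-OCRv2/PP-OCRv3 pipeline (detection + classification + recognition)
+# on a single image across backends and devices, recording runtime and end-to-end
+# latency and, optionally, CPU/GPU memory and GPU utilization.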
+ +import fastdeploy as fd +import cv2 +import os +import numpy as np +import time + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_dir", required=True, help="Model dir of PPOCR.") + parser.add_argument( + "--det_model", required=True, help="Path of Detection model of PPOCR.") + parser.add_argument( + "--cls_model", + required=True, + help="Path of Classification model of PPOCR.") + parser.add_argument( + "--rec_model", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--rec_label_file", + required=True, + help="Path of Recognization model of PPOCR.") + parser.add_argument( + "--image", type=str, required=False, help="Path of test image file.") + parser.add_argument( + "--cpu_num_thread", + type=int, + default=8, + help="default number of cpu thread.") + parser.add_argument( + "--device_id", type=int, default=0, help="device(gpu) id") + parser.add_argument( + "--iter_num", + required=True, + type=int, + default=300, + help="number of iterations for computing performace.") + parser.add_argument( + "--device", + default="cpu", + help="Type of inference device, support 'cpu' or 'gpu'.") + parser.add_argument( + "--backend", + type=str, + default="default", + help="inference backend, default, ort, ov, trt, paddle, paddle_trt.") + parser.add_argument( + "--enable_trt_fp16", + type=ast.literal_eval, + default=False, + help="whether enable fp16 in trt backend") + parser.add_argument( + "--enable_collect_memory_info", + type=ast.literal_eval, + default=False, + help="whether enable collect memory info") + args = parser.parse_args() + return args + + +def build_option(args): + option = fd.RuntimeOption() + device = args.device + backend = args.backend + enable_trt_fp16 = args.enable_trt_fp16 + option.set_cpu_thread_num(args.cpu_num_thread) + if device == "gpu": + option.use_gpu() + if backend == "ort": + option.use_ort_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend in ["trt", "paddle_trt"]: + option.use_trt_backend() + if backend == "paddle_trt": + option.enable_paddle_to_trt() + if enable_trt_fp16: + option.enable_trt_fp16() + elif backend == "default": + return option + else: + raise Exception( + "While inference with GPU, only support default/ort/paddle/trt/paddle_trt now, {} is not supported.". + format(backend)) + elif device == "cpu": + if backend == "ort": + option.use_ort_backend() + elif backend == "ov": + option.use_openvino_backend() + elif backend == "paddle": + option.use_paddle_backend() + elif backend == "default": + return option + else: + raise Exception( + "While inference with CPU, only support default/ort/ov/paddle now, {} is not supported.". + format(backend)) + else: + raise Exception( + "Only support device CPU/GPU now, {} is not supported.".format( + device)) + + return option + + +def get_current_memory_mb(gpu_id=None): + import pynvml + import psutil + pid = os.getpid() + p = psutil.Process(pid) + info = p.memory_full_info() + cpu_mem = info.uss / 1024. / 1024. + gpu_mem = 0 + if gpu_id is not None: + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) + gpu_mem = meminfo.used / 1024. / 1024. 
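+        # NOTE: NVML reports the total memory in use on device 0 (in MB),
+        # not the usage of the current process alone.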
+ return cpu_mem, gpu_mem + + +def get_current_gputil(gpu_id): + import GPUtil + GPUs = GPUtil.getGPUs() + gpu_load = GPUs[gpu_id].load + return gpu_load + + +if __name__ == '__main__': + + args = parse_arguments() + option = build_option(args) + # Detection Model + det_model_file = os.path.join(args.model_dir, args.det_model, + "inference.pdmodel") + det_params_file = os.path.join(args.model_dir, args.det_model, + "inference.pdiparams") + # Classification Model + cls_model_file = os.path.join(args.model_dir, args.cls_model, + "inference.pdmodel") + cls_params_file = os.path.join(args.model_dir, args.cls_model, + "inference.pdiparams") + # Recognition Model + rec_model_file = os.path.join(args.model_dir, args.rec_model, + "inference.pdmodel") + rec_params_file = os.path.join(args.model_dir, args.rec_model, + "inference.pdiparams") + rec_label_file = os.path.join(args.model_dir, args.rec_label_file) + + gpu_id = args.device_id + enable_collect_memory_info = args.enable_collect_memory_info + end2end_statis = list() + cpu_mem = list() + gpu_mem = list() + gpu_util = list() + if args.device == "cpu": + file_path = args.model_dir + "_model_" + args.backend + "_" + \ + args.device + "_" + str(args.cpu_num_thread) + ".txt" + else: + if args.enable_trt_fp16: + file_path = args.model_dir + "_model_" + args.backend + "_fp16_" + args.device + ".txt" + else: + file_path = args.model_dir + "_model_" + args.backend + "_" + args.device + ".txt" + f = open(file_path, "w") + f.writelines("===={}====: \n".format(os.path.split(file_path)[-1][:-4])) + + try: + det_option = option + cls_option = option + rec_option = option + if "OCRv2" in args.model_dir: + if args.backend in ["trt", "paddle_trt"]: + det_option.set_trt_input_shape( + "x", [1, 3, 64, 64], [1, 3, 640, 640], [1, 3, 960, 960]) + cls_option.set_trt_input_shape( + "x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 1024]) + rec_option.set_trt_input_shape( + "x", [1, 3, 32, 10], [10, 3, 32, 320], [64, 3, 32, 2304]) + det_model = fd.vision.ocr.DBDetector( + det_model_file, det_params_file, runtime_option=det_option) + cls_model = fd.vision.ocr.Classifier( + cls_model_file, cls_params_file, runtime_option=cls_option) + rec_model = fd.vision.ocr.Recognizer( + rec_model_file, + rec_params_file, + rec_label_file, + runtime_option=rec_option) + model = fd.vision.ocr.PPOCRv2( + det_model=det_model, cls_model=cls_model, rec_model=rec_model) + elif "OCRv3" in args.model_dir: + if args.backend in ["trt", "paddle_trt"]: + det_option.set_trt_input_shape( + "x", [1, 3, 64, 64], [1, 3, 640, 640], [1, 3, 960, 960]) + cls_option.set_trt_input_shape( + "x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 1024]) + rec_option.set_trt_input_shape( + "x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 2304]) + det_model = fd.vision.ocr.DBDetector( + det_model_file, det_params_file, runtime_option=det_option) + cls_model = fd.vision.ocr.Classifier( + cls_model_file, cls_params_file, runtime_option=cls_option) + rec_model = fd.vision.ocr.Recognizer( + rec_model_file, + rec_params_file, + rec_label_file, + runtime_option=rec_option) + model = fd.vision.ocr.PPOCRv3( + det_model=det_model, cls_model=cls_model, rec_model=rec_model) + else: + raise Exception("model {} not support now in ppocr series".format( + args.model_dir)) + det_model.enable_record_time_of_runtime() + cls_model.enable_record_time_of_runtime() + rec_model.enable_record_time_of_runtime() + im_ori = cv2.imread(args.image) + for i in range(args.iter_num): + im = im_ori + start = time.time() + result = 
model.predict(im) + end2end_statis.append(time.time() - start) + if enable_collect_memory_info: + gpu_util.append(get_current_gputil(gpu_id)) + cm, gm = get_current_memory_mb(gpu_id) + cpu_mem.append(cm) + gpu_mem.append(gm) + + runtime_statis_det = det_model.print_statis_info_of_runtime() + runtime_statis_cls = cls_model.print_statis_info_of_runtime() + runtime_statis_rec = rec_model.print_statis_info_of_runtime() + + warmup_iter = args.iter_num // 5 + end2end_statis_repeat = end2end_statis[warmup_iter:] + if enable_collect_memory_info: + cpu_mem_repeat = cpu_mem[warmup_iter:] + gpu_mem_repeat = gpu_mem[warmup_iter:] + gpu_util_repeat = gpu_util[warmup_iter:] + + dump_result = dict() + dump_result["runtime"] = ( + runtime_statis_det["avg_time"] + runtime_statis_cls["avg_time"] + + runtime_statis_rec["avg_time"]) * 1000 + dump_result["end2end"] = np.mean(end2end_statis_repeat) * 1000 + if enable_collect_memory_info: + dump_result["cpu_rss_mb"] = np.mean(cpu_mem_repeat) + dump_result["gpu_rss_mb"] = np.mean(gpu_mem_repeat) + dump_result["gpu_util"] = np.mean(gpu_util_repeat) + + f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"]))) + f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"]))) + if enable_collect_memory_info: + f.writelines("cpu_rss_mb: {} \n".format( + str(dump_result["cpu_rss_mb"]))) + f.writelines("gpu_rss_mb: {} \n".format( + str(dump_result["gpu_rss_mb"]))) + f.writelines("gpu_util: {} \n".format( + str(dump_result["gpu_util"]))) + except: + f.writelines("!!!!!Infer Failed\n") + + f.close() diff --git a/benchmark/run_benchmark_ppocr.sh b/benchmark/run_benchmark_ppocr.sh new file mode 100644 index 0000000000..c9f24afd7e --- /dev/null +++ b/benchmark/run_benchmark_ppocr.sh @@ -0,0 +1,23 @@ +echo "[FastDeploy] Running PPOCR benchmark..." 
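+
+# Each invocation below benchmarks one backend/device combination of benchmark_ppocr.py;
+# memory and GPU-utilization collection is enabled for every run.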
+ +# for PPOCRv2 +python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 2000 --backend ort --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 2000 --backend paddle --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 2000 --backend ov --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend ort --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle_trt --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle_trt --enable_trt_fp16 True --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend trt --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv2 --det_model ch_PP-OCRv2_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv2_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend trt --enable_trt_fp16 True --enable_collect_memory_info True + +# for PPOCRv3 +python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 2000 --backend ort --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 2000 --backend paddle --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --cpu_num_thread 8 --iter_num 
2000 --backend ov --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend ort --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle_trt --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend paddle_trt --enable_trt_fp16 True --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend trt --enable_collect_memory_info True +python benchmark_ppocr.py --model_dir ch_PP-OCRv3 --det_model ch_PP-OCRv3_det_infer --cls_model ch_ppocr_mobile_v2.0_cls_infer --rec_model ch_PP-OCRv3_rec_infer --rec_label_file ppocr_keys_v1.txt --image 12.jpg --device gpu --iter_num 2000 --backend trt --enable_trt_fp16 True --enable_collect_memory_info True From ccfe6ff752a6ce53bdb08055ca4aaa5c7ea43681 Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Fri, 2 Dec 2022 03:01:38 +0000 Subject: [PATCH 14/16] mv bs from 64 to 32 --- benchmark/benchmark_ppocr.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/benchmark/benchmark_ppocr.py b/benchmark/benchmark_ppocr.py index edcbb34d92..5b88622438 100644 --- a/benchmark/benchmark_ppocr.py +++ b/benchmark/benchmark_ppocr.py @@ -184,21 +184,24 @@ def get_current_gputil(gpu_id): f.writelines("===={}====: \n".format(os.path.split(file_path)[-1][:-4])) try: - det_option = option - cls_option = option rec_option = option if "OCRv2" in args.model_dir: + det_option = option if args.backend in ["trt", "paddle_trt"]: det_option.set_trt_input_shape( "x", [1, 3, 64, 64], [1, 3, 640, 640], [1, 3, 960, 960]) - cls_option.set_trt_input_shape( - "x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 1024]) - rec_option.set_trt_input_shape( - "x", [1, 3, 32, 10], [10, 3, 32, 320], [64, 3, 32, 2304]) det_model = fd.vision.ocr.DBDetector( det_model_file, det_params_file, runtime_option=det_option) + cls_option = option + if args.backend in ["trt", "paddle_trt"]: + cls_option.set_trt_input_shape( + "x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 1024]) cls_model = fd.vision.ocr.Classifier( cls_model_file, cls_params_file, runtime_option=cls_option) + rec_option = option + if args.backend in ["trt", "paddle_trt"]: + rec_option.set_trt_input_shape( + "x", [1, 3, 32, 10], [10, 3, 32, 320], [32, 3, 32, 2304]) rec_model = fd.vision.ocr.Recognizer( rec_model_file, rec_params_file, @@ -210,14 +213,16 @@ def get_current_gputil(gpu_id): if args.backend in ["trt", 
"paddle_trt"]: det_option.set_trt_input_shape( "x", [1, 3, 64, 64], [1, 3, 640, 640], [1, 3, 960, 960]) - cls_option.set_trt_input_shape( - "x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 1024]) - rec_option.set_trt_input_shape( - "x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 2304]) det_model = fd.vision.ocr.DBDetector( det_model_file, det_params_file, runtime_option=det_option) + if args.backend in ["trt", "paddle_trt"]: + cls_option.set_trt_input_shape( + "x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 1024]) cls_model = fd.vision.ocr.Classifier( cls_model_file, cls_params_file, runtime_option=cls_option) + if args.backend in ["trt", "paddle_trt"]: + rec_option.set_trt_input_shape( + "x", [1, 3, 48, 10], [10, 3, 48, 320], [64, 3, 48, 2304]) rec_model = fd.vision.ocr.Recognizer( rec_model_file, rec_params_file, From 910bc4a998da1b9ea08e16bf0a9a09c7c51d5193 Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Mon, 5 Dec 2022 07:33:46 +0000 Subject: [PATCH 15/16] fixed quantize.md --- docs/cn/quantize.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) mode change 100644 => 100755 docs/cn/quantize.md diff --git a/docs/cn/quantize.md b/docs/cn/quantize.md old mode 100644 new mode 100755 index 57f5837d8a..6bd1785bc9 --- a/docs/cn/quantize.md +++ b/docs/cn/quantize.md @@ -63,7 +63,7 @@ Benchmark表格说明: | [YOLOv7](../../examples/vision/detection/yolov7/quantize/) | Paddle Inference | CPU | 995.85 | 477.93|None|None | 2.08 |51.1 | 46.2|量化蒸馏训练 | #### 端到端 Benchmark -| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | | ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | | [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | TensorRT | GPU | 24.61 | 21.20 | 20.78 | 20.94 | 1.18 | 37.6 | 36.7 | 量化蒸馏训练 | | [YOLOv5s](../../examples/vision/detection/yolov5/quantize/) | Paddle-TensorRT | GPU | 23.53 | None | 21.98 | 19.84 | 1.28 | 37.6 | 36.8 | 量化蒸馏训练 | @@ -94,7 +94,7 @@ Benchmark表格说明: | [MobileNetV1_ssld](../../examples/vision/classification/paddleclas/quantize/) | Paddle Inference | CPU | 12.29 | 4.68 | None|None|2.62 |77.89 | 71.36 |离线量化 | #### 端到端 Benchmark -| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | | ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | | [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | TensorRT | GPU | 4.92| 2.28|2.24|2.23 | 2.21 | 79.12 | 79.06 | 离线量化 | | [ResNet50_vd](../../examples/vision/classification/paddleclas/quantize/) | Paddle-TensorRT | GPU | 4.48|None |2.09|2.10 | 2.14 | 79.12 | 79.06 | 离线量化 | @@ -119,7 +119,7 @@ NOTE: - TensorRT比Paddle-TensorRT快的原因是在runtime移除了multiclass_nms3算子 #### 端到端 Benchmark -| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | | ------------------- | 
-----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | | [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | TensorRT | GPU | 35.75 | 15.42 |20.70|20.85 | 2.32 | 51.4 | 50.7 | 量化蒸馏训练 | | [ppyoloe_crn_l_300e_coco](../../examples/vision/detection/paddledetection/quantize ) | Paddle-TensorRT | GPU | 33.48 |None | 18.47 |18.03 | 1.81 | 51.4 | 50.5| 量化蒸馏训练 | @@ -134,6 +134,6 @@ NOTE: | [PP-LiteSeg-T(STDC1)-cityscapes](../../examples/vision/segmentation/paddleseg/quantize) | Paddle Inference | CPU | 1138.04| 602.62 |None|None | 1.89 |77.37 | 71.62 |量化蒸馏训练 | #### 端到端 Benchmark -| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 | +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 | | ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | | [PP-LiteSeg-T(STDC1)-cityscapes](../../examples/vision/segmentation/paddleseg/quantize) | Paddle Inference | CPU | 4726.65| 4134.91|None|None | 1.14 |77.37 | 71.62 |量化蒸馏训练 | From 8d8362b10fd633915cc84c12a0a75bcdce7ce622 Mon Sep 17 00:00:00 2001 From: wjj19950828 Date: Mon, 5 Dec 2022 07:48:25 +0000 Subject: [PATCH 16/16] fixed quantize bugs --- docs/cn/quantize.md | 2 +- examples/vision/classification/paddleclas/quantize/README.md | 4 ++-- examples/vision/detection/paddledetection/quantize/README.md | 4 ++-- examples/vision/detection/yolov5/quantize/README.md | 4 ++-- examples/vision/detection/yolov6/quantize/README.md | 4 ++-- examples/vision/detection/yolov7/quantize/README.md | 4 ++-- examples/vision/segmentation/paddleseg/quantize/README.md | 4 ++-- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/cn/quantize.md b/docs/cn/quantize.md index 6bd1785bc9..26a75ec4eb 100755 --- a/docs/cn/quantize.md +++ b/docs/cn/quantize.md @@ -36,7 +36,7 @@ FastDeploy基于PaddleSlim的Auto Compression Toolkit(ACT), 给用户提供了 目前, FastDeploy支持自动化压缩,并完成部署测试的模型的Runtime Benchmark和端到端Benchmark如下所示. Benchmark表格说明: -- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. - 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. - 所测时延均为推理1000次后求得的平均值, 单位是毫秒. - INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 diff --git a/examples/vision/classification/paddleclas/quantize/README.md b/examples/vision/classification/paddleclas/quantize/README.md index 6e3f78b4d5..0a814e0e37 100644 --- a/examples/vision/classification/paddleclas/quantize/README.md +++ b/examples/vision/classification/paddleclas/quantize/README.md @@ -11,7 +11,7 @@ FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输 用户也可以直接下载下表中的量化模型进行部署. Benchmark表格说明: -- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. - 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. - 所测时延均为推理1000次后求得的平均值, 单位是毫秒. 
- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 @@ -33,7 +33,7 @@ Benchmark表格说明: | [MobileNetV1_ssld](https://bj.bcebos.com/paddlehub/fastdeploy/mobilenetv1_ssld_ptq.tar) | Paddle Inference | CPU | 12.29 | 4.68 | None|None|2.62 |77.89 | 71.36 |离线量化 | ### 端到端 Benchmark -| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 Top1 | INT8 Top1 | 量化方式 | | ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | | [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | TensorRT | GPU | 4.92| 2.28|2.24|2.23 | 2.21 | 79.12 | 79.06 | 离线量化 | | [ResNet50_vd](https://bj.bcebos.com/paddlehub/fastdeploy/resnet50_vd_ptq.tar) | Paddle-TensorRT | GPU | 4.48|None |2.09|2.10 | 2.14 | 79.12 | 79.06 | 离线量化 | diff --git a/examples/vision/detection/paddledetection/quantize/README.md b/examples/vision/detection/paddledetection/quantize/README.md index 8c6f1feeef..b041b34684 100644 --- a/examples/vision/detection/paddledetection/quantize/README.md +++ b/examples/vision/detection/paddledetection/quantize/README.md @@ -11,7 +11,7 @@ FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输 Benchmark表格说明: -- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. - 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. - 所测时延均为推理1000次后求得的平均值, 单位是毫秒. - INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 @@ -32,7 +32,7 @@ NOTE: - TensorRT比Paddle-TensorRT快的原因是在runtime移除了multiclass_nms3算子 #### 端到端 Benchmark -| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | | ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | | [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | TensorRT | GPU | 35.75 | 15.42 |20.70|20.85 | 2.32 | 51.4 | 50.7 | 量化蒸馏训练 | | [ppyoloe_crn_l_300e_coco](https://bj.bcebos.com/paddlehub/fastdeploy/ppyoloe_crn_l_300e_coco_qat.tar ) | Paddle-TensorRT | GPU | 33.48 |None | 18.47 |18.03 | 1.81 | 51.4 | 50.5 | 量化蒸馏训练 | diff --git a/examples/vision/detection/yolov5/quantize/README.md b/examples/vision/detection/yolov5/quantize/README.md index 853718381f..20b628d9b9 100644 --- a/examples/vision/detection/yolov5/quantize/README.md +++ b/examples/vision/detection/yolov5/quantize/README.md @@ -10,7 +10,7 @@ FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输 用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) Benchmark表格说明: -- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. - 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. - 所测时延均为推理1000次后求得的平均值, 单位是毫秒. 
- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 @@ -29,7 +29,7 @@ Benchmark表格说明: | [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle Inference| CPU | 213.73 | 130.19 | None | None | 1.64 |37.6 | 35.2 | 量化蒸馏训练 | #### 端到端 Benchmark -| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | | ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | | [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | TensorRT | GPU | 24.61 | 21.20 | 20.78 | 20.94 | 1.18 | 37.6 | 36.7 | 量化蒸馏训练 | | [YOLOv5s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_quant.tar) | Paddle-TensorRT | GPU | 23.53 | None | 21.98 | 19.84 | 1.28 | 37.6 | 36.8 | 量化蒸馏训练 | diff --git a/examples/vision/detection/yolov6/quantize/README.md b/examples/vision/detection/yolov6/quantize/README.md index 04af3f6896..bceb33afb8 100644 --- a/examples/vision/detection/yolov6/quantize/README.md +++ b/examples/vision/detection/yolov6/quantize/README.md @@ -9,7 +9,7 @@ FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输 用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) Benchmark表格说明: -- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. - 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. - 所测时延均为推理1000次后求得的平均值, 单位是毫秒. - INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 @@ -28,7 +28,7 @@ Benchmark表格说明: #### 端到端 Benchmark -| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | | ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | | [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | TensorRT | GPU | 15.66 | 11.30 | 10.25 |9.59 | 1.63 | 42.5 | 40.7|量化蒸馏训练 | | [YOLOv6s](https://bj.bcebos.com/paddlehub/fastdeploy/yolov6s_ptq_model.tar) | Paddle-TensorRT | GPU | 15.03 | None| 11.36 | 9.32 | 1.61 | 42.5 | 40.7|量化蒸馏训练 | diff --git a/examples/vision/detection/yolov7/quantize/README.md b/examples/vision/detection/yolov7/quantize/README.md index 5795325680..9c5261c342 100644 --- a/examples/vision/detection/yolov7/quantize/README.md +++ b/examples/vision/detection/yolov7/quantize/README.md @@ -11,7 +11,7 @@ FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输 Benchmark表格说明: -- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. - 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. - 所测时延均为推理1000次后求得的平均值, 单位是毫秒. 
- INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 @@ -29,7 +29,7 @@ Benchmark表格说明: | [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle Inference | CPU | 995.85 | 477.93|None|None | 2.08 |51.1 | 46.2|量化蒸馏训练 | #### 端到端 Benchmark -| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mAP | INT8 mAP | 量化方式 | | ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | | [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | TensorRT | GPU | 36.47 | 18.81 | 20.33| 17.58| 2.07 | 51.1| 50.4|量化蒸馏训练 | | [YOLOv7](https://bj.bcebos.com/paddlehub/fastdeploy/yolov7_quant.tar) | Paddle-TensorRT | GPU | 37.06|None|20.26|17.53 | 2.11 | 51.1| 50.4|量化蒸馏训练 | diff --git a/examples/vision/segmentation/paddleseg/quantize/README.md b/examples/vision/segmentation/paddleseg/quantize/README.md index 6199c653ac..add706d22f 100644 --- a/examples/vision/segmentation/paddleseg/quantize/README.md +++ b/examples/vision/segmentation/paddleseg/quantize/README.md @@ -11,7 +11,7 @@ FastDeploy 提供了一键模型自动化压缩工具, 能够简单地通过输 用户也可以直接下载下表中的量化模型进行部署.(点击模型名字即可下载) Benchmark表格说明: -- Rtuntime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. +- Runtime时延为模型在各种Runtime上的推理时延,包含CPU->GPU数据拷贝,GPU推理,GPU->CPU数据拷贝时间. 不包含模型各自的前后处理时间. - 端到端时延为模型在实际推理场景中的时延, 包含模型的前后处理. - 所测时延均为推理1000次后求得的平均值, 单位是毫秒. - INT8 + FP16 为在推理INT8量化模型的同时, 给Runtime 开启FP16推理选项 @@ -26,7 +26,7 @@ Benchmark表格说明: | [PP-LiteSeg-T(STDC1)-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_QAT_new.tar)) | Paddle Inference | CPU | 1138.04| 602.62 |None|None | 1.89 |77.37 | 71.62 |量化蒸馏训练 | #### 端到端 Benchmark -| 模型 |推理后端 |部署硬件 | FP32 Runtime时延 | INT8 Runtime时延 | INT8 + FP16 Runtime时延 | INT8+FP16+PM Runtime时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 | +| 模型 |推理后端 |部署硬件 | FP32 End2End时延 | INT8 End2End时延 | INT8 + FP16 End2End时延 | INT8+FP16+PM End2End时延 | 最大加速比 | FP32 mIoU | INT8 mIoU | 量化方式 | | ------------------- | -----------------|-----------| -------- |-------- |-------- | --------- |-------- |----- |----- |----- | | [PP-LiteSeg-T(STDC1)-cityscapes](https://bj.bcebos.com/paddlehub/fastdeploy/PP_LiteSeg_T_STDC1_cityscapes_without_argmax_infer_QAT_new.tar)) | Paddle Inference | CPU | 4726.65| 4134.91|None|None | 1.14 |77.37 | 71.62 |量化蒸馏训练 |
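The Benchmark notes edited above distinguish Runtime latency (backend inference only, including CPU->GPU and GPU->CPU copies) from End2End latency (which additionally covers the model's pre- and post-processing), and describe "INT8 + FP16" as enabling the Runtime's FP16 option while serving an INT8 quantized model. The sketch below shows one way such a measurement could be set up with the FastDeploy Python API; it is an illustration only, not part of this patch series. The model directory `yolov5s_quant`, the image `test.jpg`, the TensorRT input tensor name `"x"`, and the timing helpers `enable_record_time_of_runtime()` / `print_statis_info_of_runtime()` are assumptions modeled on the benchmark scripts and may need adjusting.

```python
# Illustrative sketch: measure Runtime vs. End2End latency of an INT8 model
# on the TensorRT backend with the FP16 option also enabled ("INT8 + FP16").
# Paths, the input tensor name "x", and the timing helpers are assumptions.
import time

import cv2
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_gpu(0)
option.use_trt_backend()      # run the quantized model on TensorRT
option.enable_trt_fp16()      # additionally allow FP16 kernels
option.set_trt_input_shape("x", [1, 3, 640, 640], [1, 3, 640, 640],
                           [1, 3, 640, 640])

model = fd.vision.detection.YOLOv5(
    "yolov5s_quant/model.pdmodel",
    "yolov5s_quant/model.pdiparams",
    runtime_option=option,
    model_format=fd.ModelFormat.PADDLE)

im = cv2.imread("test.jpg")
model.enable_record_time_of_runtime()  # collect backend-only (Runtime) times

runs = 1000
start = time.time()
for _ in range(runs):
    model.predict(im)                  # includes pre/post-processing
end2end_ms = (time.time() - start) / runs * 1000

model.print_statis_info_of_runtime()   # average Runtime latency
print("End2End latency: {:.2f} ms over {} runs".format(end2end_ms, runs))
```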