Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[StableDiffusion] Add trt backend for sd model #796

Merged
merged 5 commits on Dec 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions examples/multimodal/stable_diffusion/cpp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# StableDiffusion C++部署示例

在部署前,需确认以下两个步骤

- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)
- 2. 根据开发环境,下载预编译部署库和samples代码,参考[FastDeploy预编译库](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md)

本目录下提供`*_infer.cc`快速完成StableDiffusion各任务的C++部署示例。

## Inpaint任务

StableDiffusion Inpaint任务是一个根据提示文本补全图片的任务,具体而言就是用户给定提示文本,原始图片以及原始图片的mask图片,该任务输出补全后的图片。
160 changes: 133 additions & 27 deletions examples/multimodal/stable_diffusion/cpp/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dpm_solver_multistep_scheduler.h"
#include "fastdeploy/vision/common/processors/mat.h"
#include "./dpm_solver_multistep_scheduler.h"
#include "./pipeline_stable_diffusion_inpaint.h"
#include "fastdeploy/utils/perf.h"
#include "fastdeploy/vision/common/processors/mat.h"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "pipeline_stable_diffusion_inpaint.h"
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>

#ifdef WIN32
const char sep = '\\';
#else
const char sep = '/';
#endif

template <typename T> std::string Str(const T* value, int size) {
std::ostringstream oss;
Expand All @@ -33,17 +40,40 @@ template <typename T> std::string Str(const T* value, int size) {
return oss.str();
}

std::unique_ptr<fastdeploy::Runtime>
CreateRuntime(const std::string& model_file, const std::string& params_file,
bool use_paddle_backend = true) {
std::unique_ptr<fastdeploy::Runtime> CreateRuntime(
const std::string& model_file, const std::string& params_file,
bool use_trt_backend = false, bool use_fp16 = false,
const std::unordered_map<std::string, std::vector<std::vector<int>>>&
dynamic_shapes = {},
const std::vector<std::string>& disable_paddle_trt_ops = {}) {
fastdeploy::RuntimeOption runtime_option;
runtime_option.SetModelPath(model_file, params_file,
fastdeploy::ModelFormat::PADDLE);
runtime_option.UseGpu();
if (use_paddle_backend) {
if (!use_trt_backend) {
runtime_option.UsePaddleBackend();
} else {
runtime_option.UseOrtBackend();
runtime_option.UseTrtBackend();
runtime_option.EnablePaddleToTrt();
for (auto it = dynamic_shapes.begin(); it != dynamic_shapes.end(); ++it) {
if (it->second.size() != 3) {
std::cerr << "The size of dynamic_shapes of input `" << it->first
<< "` should be 3, but receive " << it->second.size()
<< std::endl;
continue;
}
std::vector<int> min_shape = (it->second)[0];
std::vector<int> opt_shape = (it->second)[1];
std::vector<int> max_shape = (it->second)[2];
runtime_option.SetTrtInputShape(it->first, min_shape, opt_shape,
max_shape);
}
runtime_option.SetTrtCacheFile("paddle.trt");
runtime_option.EnablePaddleTrtCollectShape();
runtime_option.DisablePaddleTrtOPs(disable_paddle_trt_ops);
if (use_fp16) {
runtime_option.EnableTrtFP16();
}
}
std::unique_ptr<fastdeploy::Runtime> runtime =
std::unique_ptr<fastdeploy::Runtime>(new fastdeploy::Runtime());
Expand All @@ -59,6 +89,20 @@ CreateRuntime(const std::string& model_file, const std::string& params_file,
}

int main() {
// 0. Init all configs
std::string model_dir = "sd15_inpaint";
int max_length = 77;
bool use_trt_backend = true;
bool use_fp16 = true;
int batch_size = 1;
int num_images_per_prompt = 1;
int num_inference_steps = 50;

int height = 512;
int width = 512;
constexpr int unet_inpaint_channels = 9;
constexpr int latents_channels = 4;

// 1. Init scheduler
std::unique_ptr<fastdeploy::Scheduler> dpm(
new fastdeploy::DPMSolverMultistepScheduler(
Expand All @@ -77,54 +121,116 @@ int main() {
/* lower_order_final = */ true));

// 2. Init text encoder runtime
std::string text_model_file = "sd15_inpaint/text_encoder/inference.pdmodel";
std::string text_params_file =
"sd15_inpaint/text_encoder/inference.pdiparams";
std::unordered_map<std::string, std::vector<std::vector<int>>>
text_dynamic_shape = {{"input_ids",
{/* min_shape */ {1, max_length},
/* opt_shape */ {batch_size, max_length},
/* max_shape */ {2 * batch_size, max_length}}}};
std::string text_model_dir = model_dir + sep + "text_encoder";
std::string text_model_file = text_model_dir + sep + "inference.pdmodel";
std::string text_params_file = text_model_dir + sep + "inference.pdiparams";
std::unique_ptr<fastdeploy::Runtime> text_encoder_runtime =
CreateRuntime(text_model_file, text_params_file, false);
CreateRuntime(text_model_file, text_params_file, use_trt_backend,
use_fp16, text_dynamic_shape);

// 3. Init vae encoder runtime
std::unordered_map<std::string, std::vector<std::vector<int>>>
vae_encoder_dynamic_shape = {
{"sample",
{/* min_shape */ {1, 3, height, width},
/* opt_shape */ {2 * batch_size, 3, height, width},
/* max_shape */ {2 * batch_size, 3, height, width}}}};
std::string vae_encoder_model_dir = model_dir + sep + "vae_encoder";
std::string vae_encoder_model_file =
"sd15_inpaint/vae_encoder/inference.pdmodel";
vae_encoder_model_dir + sep + "inference.pdmodel";
std::string vae_encoder_params_file =
"sd15_inpaint/vae_encoder/inference.pdiparams";
vae_encoder_model_dir + sep + "inference.pdiparams";
std::unique_ptr<fastdeploy::Runtime> vae_encoder_runtime =
CreateRuntime(vae_encoder_model_file, vae_encoder_params_file);
CreateRuntime(vae_encoder_model_file, vae_encoder_params_file,
use_trt_backend, use_fp16, vae_encoder_dynamic_shape);

// 4. Init vae decoder runtime
std::unordered_map<std::string, std::vector<std::vector<int>>>
vae_decoder_dynamic_shape = {
{"latent_sample",
{/* min_shape */ {1, latents_channels, height / 8, width / 8},
/* opt_shape */
{2 * batch_size, latents_channels, height / 8, width / 8},
/* max_shape */
{2 * batch_size, latents_channels, height / 8, width / 8}}}};
std::string vae_decoder_model_dir = model_dir + sep + "vae_decoder";
std::string vae_decoder_model_file =
"sd15_inpaint/vae_decoder/inference.pdmodel";
vae_decoder_model_dir + sep + "inference.pdmodel";
std::string vae_decoder_params_file =
"sd15_inpaint/vae_decoder/inference.pdiparams";
vae_decoder_model_dir + sep + "inference.pdiparams";
std::unique_ptr<fastdeploy::Runtime> vae_decoder_runtime =
CreateRuntime(vae_decoder_model_file, vae_decoder_params_file);
CreateRuntime(vae_decoder_model_file, vae_decoder_params_file,
use_trt_backend, use_fp16, vae_decoder_dynamic_shape);

// 5. Init unet runtime
std::string unet_model_file = "sd15_inpaint/unet/inference.pdmodel";
std::string unet_params_file = "sd15_inpaint/unet/inference.pdiparams";
std::unordered_map<std::string, std::vector<std::vector<int>>>
unet_dynamic_shape = {
{"sample",
{/* min_shape */ {1, unet_inpaint_channels, height / 8, width / 8},
/* opt_shape */
{2 * batch_size, unet_inpaint_channels, height / 8, width / 8},
/* max_shape */
{2 * batch_size, unet_inpaint_channels, height / 8, width / 8}}},
{"timesteps", {{1}, {1}, {1}}},
{"encoder_hidden_states",
{{1, max_length, 768},
{2 * batch_size, max_length, 768},
{2 * batch_size, max_length, 768}}}};
std::vector<std::string> unet_disable_paddle_trt_ops = {"sin", "cos"};
std::string unet_model_dir = model_dir + sep + "unet";
std::string unet_model_file = unet_model_dir + sep + "inference.pdmodel";
std::string unet_params_file = unet_model_dir + sep + "inference.pdiparams";
std::unique_ptr<fastdeploy::Runtime> unet_runtime =
CreateRuntime(unet_model_file, unet_params_file);
CreateRuntime(unet_model_file, unet_params_file, use_trt_backend,
use_fp16, unet_dynamic_shape, unet_disable_paddle_trt_ops);

// 6. Init fast tokenizer
paddlenlp::fast_tokenizer::tokenizers_impl::ClipFastTokenizer tokenizer(
"clip/vocab.json", "clip/merges.txt", /* max_length = */ 77);
"clip/vocab.json", "clip/merges.txt", /* max_length = */ max_length);
fastdeploy::StableDiffusionInpaintPipeline pipe(
std::move(vae_encoder_runtime), std::move(vae_decoder_runtime),
std::move(text_encoder_runtime), std::move(unet_runtime),
/* scheduler = */ std::move(dpm), tokenizer);
/* vae_encoder = */ std::move(vae_encoder_runtime),
/* vae_decoder = */ std::move(vae_decoder_runtime),
/* text_encoder = */ std::move(text_encoder_runtime),
/* unet = */ std::move(unet_runtime),
/* scheduler = */ std::move(dpm),
/* tokenizer = */ tokenizer);

// 7. Read images
auto image = cv::imread("overture-creations.png");
auto mask_image = cv::imread("overture-creations-mask.png");

// 8. Predict
/*
* One may need to pass the initial noise to predict api.
* There's an example:
* std::vector<float> latents_data = {xxxx};
* fastdeploy::FDTensor latents;
* latents.SetExternalData({batch_size * num_images_per_prompt, latents_channels, height / 8, width / 8},fastdeploy::FDDataType::FP32, latents_data.data());
* pipe.Predict(..., /* latents = *\/ &latents, ....);
*/
std::vector<std::string> prompts = {
"Face of a yellow cat, high resolution, sitting on a park bench"};
std::vector<fastdeploy::FDTensor> outputs;
fastdeploy::TimeCounter tc;
tc.Start();
pipe.Predict(prompts, image, mask_image, &outputs, /* height = */ 512,
/* width = */ 512, /* num_inference_steps = */ 50);
pipe.Predict(prompts, image, mask_image, &outputs,
/* height = */ height,
/* width = */ width,
/* num_inference_steps = */ num_inference_steps,
/* guidance_scale = */ 7.5,
/* negative_prompt = */ {},
/* num_images_per_prompt = */ num_images_per_prompt,
/* eta = */ 1.0,
/* max_length = */ max_length,
/* latents = */ nullptr,
/* output_cv_mat = */ true,
/* callback = */ nullptr,
/* callback_steps = */ 1);
tc.End();
tc.PrintInfo();
fastdeploy::vision::FDMat mat = fastdeploy::vision::FDMat::Create(outputs[0]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ void StableDiffusionInpaintPipeline::PrepareMaskAndMaskedImage(
float_mask[i] = 1;
}
}
image_mask.SetExternalData({1, 1, shape[1] * 8, shape[0] * 8},
// NCHW format
image_mask.SetExternalData({1, 1, shape[0] * 8, shape[1] * 8},
FDDataType::FP32, float_mask.data());

// Set mask_image
Expand Down Expand Up @@ -314,9 +315,6 @@ void StableDiffusionInpaintPipeline::Predict(
vision::FDMat mask_fdmat_t = vision::FDMat::Create((*output_images)[i]);
vision::RGB2BGR::Run(&mask_fdmat_t, vision::ProcLib::OPENCV);
mask_fdmat_t.CopyToTensor(&(*output_images)[i]);
FDTensor sum;
function::Sum((*output_images)[i], &sum, {}, false, true);
FDINFO << "sum = " << ((float*)sum.Data())[0] << std::endl;
}
}
} // namespace fastdeploy