From b66bc302fda13f9baebee21da81b164e9600d8c8 Mon Sep 17 00:00:00 2001 From: zhoushunjie Date: Wed, 22 Feb 2023 12:03:08 +0000 Subject: [PATCH 01/17] Add pipeline_fastdeploy_cycle_diffusion.py --- ppdiffusers/ppdiffusers/pipelines/__init__.py | 1 + .../pipelines/stable_diffusion/__init__.py | 1 + .../pipeline_fastdeploy_cycle_diffusion.py | 684 ++++++++++++++++++ 3 files changed, 686 insertions(+) create mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py diff --git a/ppdiffusers/ppdiffusers/pipelines/__init__.py b/ppdiffusers/ppdiffusers/pipelines/__init__.py index 4cba94ff8388..e0a4873c1506 100644 --- a/ppdiffusers/ppdiffusers/pipelines/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/__init__.py @@ -94,6 +94,7 @@ from ..utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403 else: from .stable_diffusion import ( + FastDeployCycleDiffusionPipeline, FastDeployStableDiffusionImg2ImgPipeline, FastDeployStableDiffusionInpaintPipeline, FastDeployStableDiffusionInpaintPipelineLegacy, diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py index 4b0bdca2c3fa..f76037228930 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py @@ -92,6 +92,7 @@ class StableDiffusionPipelineOutput(BaseOutput): from .pipeline_stable_diffusion_k_diffusion import StableDiffusionKDiffusionPipeline if is_paddlenlp_available() and is_fastdeploy_available(): + from .pipeline_fastdeploy_cycle_diffusion import FastDeployCycleDiffusionPipeline from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline from .pipeline_fastdeploy_stable_diffusion_img2img import ( FastDeployStableDiffusionImg2ImgPipeline, diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py new file mode 100644 index 000000000000..cdcd446d3ef9 --- /dev/null +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py @@ -0,0 +1,684 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, List, Optional, Union + +import numpy as np +import paddle +import PIL + +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTokenizer + +from ...configuration_utils import FrozenDict +from ...fastdeploy_utils import FastDeployRuntimeModel +from ...pipeline_utils import DiffusionPipeline +from ...schedulers import DDIMScheduler +from ...utils import PIL_INTERPOLATION, deprecate, logging +from . 
import StableDiffusionPipelineOutput + +logger = logging.get_logger(__name__) + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess +def preprocess(image): + if isinstance(image, paddle.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = map(lambda x: x - x % 8, (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = paddle.to_tensor(image) + elif isinstance(image[0], paddle.Tensor): + image = paddle.concat(image, axis=0) + return image + + +def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta): + # 1. get previous step value (=t-1) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps + + if prev_timestep <= 0: + return clean_latents + + # 2. compute alphas, betas + alpha_prod_t = scheduler.alphas_cumprod[timestep] + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) + + variance = scheduler._get_variance(timestep, prev_timestep) + std_dev_t = eta * variance ** (0.5) + + # direction pointing to x_t + e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5) + dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t + noise = std_dev_t * paddle.randn(clean_latents.shape, dtype=clean_latents.dtype, generator=generator) + prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise + + return prev_latents + + +def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): + # 1. get previous step value (=t-1) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = scheduler.alphas_cumprod[timestep] + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) + + beta_prod_t = 1 - alpha_prod_t + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) + + # 4. Clip "predicted x_0" + if scheduler.config.clip_sample: + pred_original_sample = pred_original_sample.clip(-1, 1) + + # 5. compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = scheduler._get_variance(timestep, prev_timestep) + std_dev_t = eta * variance ** (0.5) + + # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred + + noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / ( + variance ** (0.5) * eta + ) + return noise + + +class FastDeployCycleDiffusionPipeline(DiffusionPipeline): + r""" + Pipeline for text-guided image to image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae_encoder: FastDeployRuntimeModel,
+        vae_decoder: FastDeployRuntimeModel,
+        text_encoder: FastDeployRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: FastDeployRuntimeModel,
+        scheduler: DDIMScheduler,
+        safety_checker: FastDeployRuntimeModel,
+        feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might lead to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. PaddleNLP team, diffusers team and Hugging Face"
+                " strongly recommend keeping the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
+        self.register_modules(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(
+        self,
+        prompt,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[Union[paddle.Tensor, np.ndarray]] = None,
+        negative_prompt_embeds: Optional[Union[paddle.Tensor, np.ndarray]] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            prompt_embeds (`paddle.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`paddle.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+ """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pd", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids.numpy() + + prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int64)) + prompt_embeds = paddle.to_tensor(prompt_embeds[0]) + + prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="np", + ) + + negative_prompt_embeds = self.text_encoder( + input_ids=uncond_input.input_ids.astype(np.int64), + ) + negative_prompt_embeds = paddle.to_tensor(negative_prompt_embeds[0]) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs + def check_inputs( + self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) + else: + has_nsfw_concept = None + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / 0.18215 * latents + image = self.vae.decode(latents).sample + image = (image / 2 + 0.5).clip(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 + image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): + image = image.cast(dtype) + + batch_size = image.shape[0] + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if isinstance(generator, list): + init_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = paddle.concat(init_latents, axis=0) + else: + init_latents = self.vae.encode(image).latent_dist.sample(generator) + init_latents = 0.18215 * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." 
+ ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = paddle.concat([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = paddle.concat([init_latents] * num_images_per_prompt, axis=0) + + # add noise to latents using the timestep + shape = init_latents.shape + if isinstance(generator, list): + shape = [ + 1, + ] + shape[1:] + noise = [paddle.randn(shape, generator=generator[i], dtype=dtype) for i in range(batch_size)] + noise = paddle.concat(noise, axis=0) + else: + noise = paddle.randn(shape, generator=generator, dtype=dtype) + + # get latents + clean_latents = init_latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents, clean_latents + + def __call__( + self, + prompt: Union[str, List[str]], + source_prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[paddle.Tensor, np.ndarray]] = None, + source_guidance_scale: Optional[float] = 1, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[Union[paddle.Tensor, np.ndarray]] = None, + negative_prompt_embeds: Optional[Union[paddle.Tensor, np.ndarray]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The target prompt or prompts to guide the image generation. + source_prompt (`str` or `List[str]`): + The source prompt or prompts describe the input image. + image (`paddle.Tensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. + `image` will be used as a starting point, adding more noise to it the larger the `strength`. The + number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added + noise will be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter will be modulated by `strength`. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages generating images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The negative prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            source_guidance_scale (`float`, *optional*, defaults to 1):
+                Guidance scale for the source prompt. This is useful to control the amount of influence the source
+                prompt has on the encoding.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.1):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`paddle.Generator`, *optional*):
+                One or a list of paddle generator(s) to make generation deterministic.
+            prompt_embeds (`paddle.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`paddle.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 1. Check inputs
+        self.check_inputs(prompt, strength, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3.
Encode target prompt and source prompt + prompt_embeds = self._encode_prompt( + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + source_prompt_embeds = self._encode_prompt( + source_prompt, num_images_per_prompt, do_classifier_free_guidance, None + ) + + # 4. Preprocess image + image = preprocess(image) + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) + + # 6. Prepare latent variables + latents, clean_latents = self.prepare_latents( + image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, generator + ) + source_latents = latents + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + generator = extra_step_kwargs.pop("generator", None) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + unet_output_name = self.unet.model.get_output_info(0).name + unet_input_names = [self.unet.model.get_input_info(i).name for i in range(self.unet.model.num_inputs())] + height, width = image[-2:] + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + concat_noise_pred = paddle.zeros( + [4 * batch_size * num_images_per_prompt, 4, height // 8, width // 8], dtype="float32" + ) + # expand the latents if we are doing classifier free guidance + latent_model_input = paddle.concat([latents] * 2) + source_latent_model_input = paddle.concat([source_latents] * 2) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t) + + # predict the noise residual + concat_latent_model_input = paddle.stack( + [ + source_latent_model_input[0], + latent_model_input[0], + source_latent_model_input[1], + latent_model_input[1], + ], + axis=0, + ) + concat_prompt_embeds = paddle.stack( + [ + source_prompt_embeds[0], + prompt_embeds[0], + source_prompt_embeds[1], + prompt_embeds[1], + ], + axis=0, + ) + + # predict the noise residual + self.unet.zero_copy_infer( + prebinded_inputs={ + unet_input_names[0]: concat_latent_model_input, + unet_input_names[1]: t, + unet_input_names[2]: concat_prompt_embeds, + }, + prebinded_outputs={unet_output_name: concat_noise_pred}, + share_with_raw_ptr=True, + ) + + # perform guidance + ( + source_noise_pred_uncond, + noise_pred_uncond, + source_noise_pred_text, + noise_pred_text, + ) = concat_noise_pred.chunk(4, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + source_noise_pred = source_noise_pred_uncond + source_guidance_scale * ( + source_noise_pred_text - source_noise_pred_uncond + ) + + # Sample source_latents from the posterior distribution. + prev_source_latents = posterior_sample( + self.scheduler, source_latents, t, clean_latents, generator=generator, **extra_step_kwargs + ) + # Compute noise. 
+ noise = compute_noise( + self.scheduler, prev_source_latents, source_latents, t, source_noise_pred, **extra_step_kwargs + ) + source_latents = prev_source_latents + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs + ).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + # 9. Post-processing + image = self.decode_latents(latents) + + # 10. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) + + # 11. Convert to PIL + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) From 94a411338c855140d5406d17c0750a13be9d8c20 Mon Sep 17 00:00:00 2001 From: zhoushunjie Date: Wed, 22 Feb 2023 12:57:21 +0000 Subject: [PATCH 02/17] Add cycle diffusion example --- .../deploy/text_guided_img_to_img_infer.py | 349 ++++++++++++++++++ ppdiffusers/ppdiffusers/__init__.py | 1 + .../pipeline_fastdeploy_cycle_diffusion.py | 21 +- 3 files changed, 366 insertions(+), 5 deletions(-) create mode 100644 ppdiffusers/deploy/text_guided_img_to_img_infer.py diff --git a/ppdiffusers/deploy/text_guided_img_to_img_infer.py b/ppdiffusers/deploy/text_guided_img_to_img_infer.py new file mode 100644 index 000000000000..2af83fae9d83 --- /dev/null +++ b/ppdiffusers/deploy/text_guided_img_to_img_infer.py @@ -0,0 +1,349 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +from io import BytesIO + +import fastdeploy as fd +import paddle +import requests +from fastdeploy import ModelFormat +from PIL import Image + +from paddlenlp.trainer.argparser import strtobool +from paddlenlp.transformers import CLIPTokenizer +from ppdiffusers import ( + DDIMScheduler, + FastDeployCycleDiffusionPipeline, + FastDeployRuntimeModel, +) + + +def parse_arguments(): + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_dir", default="paddle_diffusion_model", help="The model directory of diffusion_model." + ) + parser.add_argument("--model_format", default="paddle", choices=["paddle", "onnx"], help="The model format.") + parser.add_argument("--unet_model_prefix", default="unet", help="The file prefix of unet model.") + parser.add_argument( + "--vae_decoder_model_prefix", default="vae_decoder", help="The file prefix of vae decoder model." + ) + parser.add_argument( + "--vae_encoder_model_prefix", default="vae_encoder", help="The file prefix of vae encoder model." + ) + parser.add_argument( + "--text_encoder_model_prefix", default="text_encoder", help="The file prefix of text_encoder model." 
+ ) + parser.add_argument("--inference_steps", type=int, default=50, help="The number of unet inference steps.") + parser.add_argument("--benchmark_steps", type=int, default=1, help="The number of performance benchmark steps.") + parser.add_argument( + "--backend", + type=str, + default="paddle", + # Note(zhoushunjie): Will support 'tensorrt', 'paddle-tensorrt' soon. + choices=["onnx_runtime", "paddle", "paddle-tensorrt", "tensorrt", "paddlelite"], + help="The inference runtime backend of unet model and text encoder model.", + ) + parser.add_argument( + "--device", + type=str, + default="gpu", + # Note(shentanyue): Will support more devices. + choices=[ + "cpu", + "gpu", + "huawei_ascend_npu", + "kunlunxin_xpu", + ], + help="The inference runtime device of models.", + ) + parser.add_argument("--use_fp16", type=strtobool, default=False, help="Wheter to use FP16 mode") + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") + return parser.parse_args() + + +def create_ort_runtime(model_dir, model_prefix, model_format, device_id=0): + option = fd.RuntimeOption() + option.use_ort_backend() + option.use_gpu(device_id) + if model_format == "paddle": + model_file = os.path.join(model_dir, model_prefix, "inference.pdmodel") + params_file = os.path.join(model_dir, model_prefix, "inference.pdiparams") + option.set_model_path(model_file, params_file) + else: + onnx_file = os.path.join(model_dir, model_prefix, "inference.onnx") + option.set_model_path(onnx_file, model_format=ModelFormat.ONNX) + return fd.Runtime(option) + + +def create_paddle_inference_runtime( + model_dir, model_prefix, use_trt=False, dynamic_shape=None, use_fp16=False, device_id=0 +): + option = fd.RuntimeOption() + option.use_paddle_backend() + if device_id == -1: + option.use_cpu() + else: + option.use_gpu(device_id) + if use_trt: + option.use_trt_backend() + option.enable_paddle_to_trt() + if use_fp16: + option.enable_trt_fp16() + cache_file = os.path.join(model_dir, model_prefix, "inference.trt") + option.set_trt_cache_file(cache_file) + # Need to enable collect shape for ernie + if dynamic_shape is not None: + option.enable_paddle_trt_collect_shape() + for key, shape_dict in dynamic_shape.items(): + option.set_trt_input_shape( + key, + min_shape=shape_dict["min_shape"], + opt_shape=shape_dict.get("opt_shape", None), + max_shape=shape_dict.get("max_shape", None), + ) + model_file = os.path.join(model_dir, model_prefix, "inference.pdmodel") + params_file = os.path.join(model_dir, model_prefix, "inference.pdiparams") + option.set_model_path(model_file, params_file) + return fd.Runtime(option) + + +def create_paddle_lite_runtime(model_dir, model_prefix, device="cpu", device_id=0): + option = fd.RuntimeOption() + option.use_lite_backend() + if device == "huawei_ascend_npu": + option.use_ascend() + option.set_lite_device_names(["huawei_ascend_npu"]) + option.set_lite_model_cache_dir(os.path.join(model_dir, model_prefix)) + option.set_lite_context_properties( + "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format( + device_id + ) + ) + elif device == "kunlunxin_xpu": + # TODO(shentanyue): Add kunlunxin_xpu code + pass + else: + pass + model_file = os.path.join(model_dir, model_prefix, "inference.pdmodel") + params_file = os.path.join(model_dir, model_prefix, "inference.pdiparams") + option.set_model_path(model_file, params_file) + return fd.Runtime(option) + + +def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 
31), dynamic_shape=None, device_id=0): + option = fd.RuntimeOption() + option.use_trt_backend() + option.use_gpu(device_id) + option.enable_trt_fp16() + option.set_trt_max_workspace_size(workspace) + if dynamic_shape is not None: + for key, shape_dict in dynamic_shape.items(): + option.set_trt_input_shape( + key, + min_shape=shape_dict["min_shape"], + opt_shape=shape_dict.get("opt_shape", None), + max_shape=shape_dict.get("max_shape", None), + ) + if model_format == "paddle": + model_file = os.path.join(model_dir, model_prefix, "inference.pdmodel") + params_file = os.path.join(model_dir, model_prefix, "inference.pdiparams") + option.set_model_path(model_file, params_file) + else: + onnx_file = os.path.join(model_dir, model_prefix, "inference.onnx") + option.set_model_path(onnx_file, model_format=ModelFormat.ONNX) + cache_file = os.path.join(model_dir, model_prefix, "inference.trt") + option.set_trt_cache_file(cache_file) + return fd.Runtime(option) + + +if __name__ == "__main__": + args = parse_arguments() + # 0. Init device id + device_id = args.device_id + if args.device == "cpu": + device_id = -1 + paddle.set_device("cpu") + else: + paddle.set_device(f"gpu:{device_id}") + + # 1. Init scheduler + scheduler = DDIMScheduler.from_pretrained(os.path.join(args.model_dir, "scheduler")) + + # 2. Init tokenizer + tokenizer = CLIPTokenizer.from_pretrained(os.path.join(args.model_dir, "tokenizer")) + + # 3. Set dynamic shape for trt backend + vae_decoder_dynamic_shape = { + "latent_sample": { + "min_shape": [1, 4, 64, 64], + "max_shape": [2, 4, 64, 64], + "opt_shape": [2, 4, 64, 64], + } + } + + vae_encoder_dynamic_shape = { + "sample": { + "min_shape": [1, 3, 512, 512], + "max_shape": [2, 3, 512, 512], + "opt_shape": [2, 3, 512, 512], + } + } + + unet_dynamic_shape = { + "sample": { + "min_shape": [1, 4, 64, 64], + "max_shape": [2, 4, 64, 64], + "opt_shape": [2, 4, 64, 64], + }, + "timestep": { + "min_shape": [1], + "max_shape": [1], + "opt_shape": [1], + }, + "encoder_hidden_states": { + "min_shape": [1, 77, 768], + "max_shape": [2, 77, 768], + "opt_shape": [2, 77, 768], + }, + } + # 4. 
Init runtime + if args.backend == "onnx_runtime": + text_encoder_runtime = create_ort_runtime( + args.model_dir, args.text_encoder_model_prefix, args.model_format, device_id=device_id + ) + vae_decoder_runtime = create_ort_runtime( + args.model_dir, args.vae_decoder_model_prefix, args.model_format, device_id=device_id + ) + vae_encoder_runtime = create_ort_runtime( + args.model_dir, args.vae_encoder_model_prefix, args.model_format, device_id=device_id + ) + start = time.time() + unet_runtime = create_ort_runtime( + args.model_dir, args.unet_model_prefix, args.model_format, device_id=device_id + ) + print(f"Spend {time.time() - start : .2f} s to load unet model.") + elif args.backend == "paddle" or args.backend == "paddle-tensorrt": + use_trt = True if args.backend == "paddle-tensorrt" else False + # Note(zhoushunjie): Will change to paddle runtime later + text_encoder_runtime = create_ort_runtime( + args.model_dir, args.text_encoder_model_prefix, args.model_format, device_id=device_id + ) + vae_decoder_runtime = create_paddle_inference_runtime( + args.model_dir, + args.vae_decoder_model_prefix, + use_trt, + vae_decoder_dynamic_shape, + use_fp16=args.use_fp16, + device_id=device_id, + ) + vae_encoder_runtime = create_paddle_inference_runtime( + args.model_dir, + args.vae_encoder_model_prefix, + use_trt, + vae_encoder_dynamic_shape, + use_fp16=args.use_fp16, + device_id=device_id, + ) + start = time.time() + unet_runtime = create_paddle_inference_runtime( + args.model_dir, + args.unet_model_prefix, + use_trt, + unet_dynamic_shape, + use_fp16=args.use_fp16, + device_id=device_id, + ) + print(f"Spend {time.time() - start : .2f} s to load unet model.") + elif args.backend == "tensorrt": + text_encoder_runtime = create_ort_runtime(args.model_dir, args.text_encoder_model_prefix, args.model_format) + vae_decoder_runtime = create_trt_runtime( + args.model_dir, + args.vae_decoder_model_prefix, + args.model_format, + workspace=(1 << 30), + dynamic_shape=vae_decoder_dynamic_shape, + device_id=device_id, + ) + vae_encoder_runtime = create_trt_runtime( + args.model_dir, + args.vae_encoder_model_prefix, + args.model_format, + workspace=(1 << 30), + dynamic_shape=vae_encoder_dynamic_shape, + device_id=device_id, + ) + start = time.time() + unet_runtime = create_trt_runtime( + args.model_dir, + args.unet_model_prefix, + args.model_format, + dynamic_shape=unet_dynamic_shape, + device_id=device_id, + ) + print(f"Spend {time.time() - start : .2f} s to load unet model.") + elif args.backend == "paddlelite": + text_encoder_runtime = create_paddle_lite_runtime( + args.model_dir, args.text_encoder_model_prefix, device=args.device, device_id=device_id + ) + vae_decoder_runtime = create_paddle_lite_runtime( + args.model_dir, args.vae_decoder_model_prefix, device=args.device, device_id=device_id + ) + vae_encoder_runtime = create_paddle_lite_runtime( + args.model_dir, args.vae_encoder_model_prefix, device=args.device, device_id=device_id + ) + start = time.time() + unet_runtime = create_paddle_lite_runtime( + args.model_dir, args.unet_model_prefix, device=args.device, device_id=device_id + ) + print(f"Spend {time.time() - start : .2f} s to load unet model.") + + pipe = FastDeployCycleDiffusionPipeline( + vae_encoder=FastDeployRuntimeModel(model=vae_encoder_runtime), + vae_decoder=FastDeployRuntimeModel(model=vae_decoder_runtime), + text_encoder=FastDeployRuntimeModel(model=text_encoder_runtime), + tokenizer=tokenizer, + unet=FastDeployRuntimeModel(model=unet_runtime), + scheduler=scheduler, + safety_checker=None, + 
feature_extractor=None, + ) + + # 5. Download an initial image + url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png" + response = requests.get(url) + init_image = Image.open(BytesIO(response.content)).convert("RGB") + init_image = init_image.resize((512, 512)) + init_image.save("horse.png") + + # 6. Specify a prompt + source_prompt = "An astronaut riding a horse" + prompt = "An astronaut riding an elephant" + + # 7. Call the pipeline + image = pipe( + prompt=prompt, + source_prompt=source_prompt, + image=init_image, + num_inference_steps=100, + eta=0.1, + strength=0.8, + guidance_scale=2, + source_guidance_scale=1, + ).images[0] + image.save("horse_to_elephant.png") diff --git a/ppdiffusers/ppdiffusers/__init__.py b/ppdiffusers/ppdiffusers/__init__.py index b4656561df15..91d9e27acbb1 100644 --- a/ppdiffusers/ppdiffusers/__init__.py +++ b/ppdiffusers/ppdiffusers/__init__.py @@ -147,6 +147,7 @@ from .utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403 else: from .pipelines import ( + FastDeployCycleDiffusionPipeline, FastDeployStableDiffusionImg2ImgPipeline, FastDeployStableDiffusionInpaintPipeline, FastDeployStableDiffusionInpaintPipelineLegacy, diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py index cdcd446d3ef9..c2c647b5a3fd 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py @@ -388,11 +388,22 @@ def run_safety_checker(self, image, dtype): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): latents = 1 / 0.18215 * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clip(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 - image = image.transpose([0, 2, 3, 1]).cast("float32").numpy() - return image + latents_shape = latents.shape + vae_output_shape = [latents_shape[0], 3, latents_shape[2] * 8, latents_shape[3] * 8] + images_vae = paddle.zeros(vae_output_shape, dtype="float32") + + vae_input_name = self.vae_decoder.model.get_input_info(0).name + vae_output_name = self.vae_decoder.model.get_output_info(0).name + + self.vae_decoder.zero_copy_infer( + prebinded_inputs={vae_input_name: latents}, + prebinded_outputs={vae_output_name: images_vae}, + share_with_raw_ptr=True, + ) + + images_vae = paddle.clip(images_vae / 2 + 0.5, 0, 1) + images = images_vae.transpose([0, 2, 3, 1]) + return images.numpy() # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): From b5199054edb399c635fd6a4418b18993f97dc2e2 Mon Sep 17 00:00:00 2001 From: zhoushunjie Date: Wed, 22 Feb 2023 13:40:13 +0000 Subject: [PATCH 03/17] remove cast --- .../deploy/text_guided_img_to_img_infer.py | 68 +++++++++---------- .../pipeline_fastdeploy_cycle_diffusion.py | 3 - 2 files changed, 33 insertions(+), 38 deletions(-) diff --git a/ppdiffusers/deploy/text_guided_img_to_img_infer.py b/ppdiffusers/deploy/text_guided_img_to_img_infer.py index 2af83fae9d83..0b305e8f89c0 100644 --- a/ppdiffusers/deploy/text_guided_img_to_img_infer.py +++ 
b/ppdiffusers/deploy/text_guided_img_to_img_infer.py @@ -92,7 +92,15 @@ def create_ort_runtime(model_dir, model_prefix, model_format, device_id=0): def create_paddle_inference_runtime( - model_dir, model_prefix, use_trt=False, dynamic_shape=None, use_fp16=False, device_id=0 + model_dir, + model_prefix, + use_trt=False, + dynamic_shape=None, + use_fp16=False, + device_id=0, + disable_paddle_trt_ops=[], + disable_paddle_pass=[], + paddle_stream=None, ): option = fd.RuntimeOption() option.use_paddle_backend() @@ -100,11 +108,15 @@ def create_paddle_inference_runtime( option.use_cpu() else: option.use_gpu(device_id) + if paddle_stream is not None: + option.set_external_raw_stream(paddle_stream) + for pass_name in disable_paddle_pass: + option.paddle_infer_option.delete_pass(pass_name) if use_trt: - option.use_trt_backend() - option.enable_paddle_to_trt() + option.paddle_infer_option.disable_trt_ops(disable_paddle_trt_ops) + option.paddle_infer_option.enable_trt = True if use_fp16: - option.enable_trt_fp16() + option.trt_option.enable_fp16 = True cache_file = os.path.join(model_dir, model_prefix, "inference.trt") option.set_trt_cache_file(cache_file) # Need to enable collect shape for ernie @@ -181,6 +193,7 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 paddle.set_device("cpu") else: paddle.set_device(f"gpu:{device_id}") + paddle_stream = paddle.device.cuda.current_stream(device_id).cuda_stream # 1. Init scheduler scheduler = DDIMScheduler.from_pretrained(os.path.join(args.model_dir, "scheduler")) @@ -196,15 +209,13 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 "opt_shape": [2, 4, 64, 64], } } - - vae_encoder_dynamic_shape = { - "sample": { - "min_shape": [1, 3, 512, 512], - "max_shape": [2, 3, 512, 512], - "opt_shape": [2, 3, 512, 512], + text_encoder_shape = { + "input_ids": { + "min_shape": [1, 77], + "max_shape": [2, 77], + "opt_shape": [1, 77], } } - unet_dynamic_shape = { "sample": { "min_shape": [1, 4, 64, 64], @@ -230,9 +241,6 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 vae_decoder_runtime = create_ort_runtime( args.model_dir, args.vae_decoder_model_prefix, args.model_format, device_id=device_id ) - vae_encoder_runtime = create_ort_runtime( - args.model_dir, args.vae_encoder_model_prefix, args.model_format, device_id=device_id - ) start = time.time() unet_runtime = create_ort_runtime( args.model_dir, args.unet_model_prefix, args.model_format, device_id=device_id @@ -241,24 +249,24 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 elif args.backend == "paddle" or args.backend == "paddle-tensorrt": use_trt = True if args.backend == "paddle-tensorrt" else False # Note(zhoushunjie): Will change to paddle runtime later - text_encoder_runtime = create_ort_runtime( - args.model_dir, args.text_encoder_model_prefix, args.model_format, device_id=device_id - ) - vae_decoder_runtime = create_paddle_inference_runtime( + text_encoder_runtime = create_paddle_inference_runtime( args.model_dir, - args.vae_decoder_model_prefix, + args.text_encoder_model_prefix, use_trt, - vae_decoder_dynamic_shape, + text_encoder_shape, use_fp16=args.use_fp16, device_id=device_id, + disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"], + paddle_stream=paddle_stream, ) - vae_encoder_runtime = create_paddle_inference_runtime( + vae_decoder_runtime = create_paddle_inference_runtime( args.model_dir, - args.vae_encoder_model_prefix, + args.vae_decoder_model_prefix, 
use_trt, - vae_encoder_dynamic_shape, + vae_decoder_dynamic_shape, use_fp16=args.use_fp16, device_id=device_id, + paddle_stream=paddle_stream, ) start = time.time() unet_runtime = create_paddle_inference_runtime( @@ -268,6 +276,7 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 unet_dynamic_shape, use_fp16=args.use_fp16, device_id=device_id, + paddle_stream=paddle_stream, ) print(f"Spend {time.time() - start : .2f} s to load unet model.") elif args.backend == "tensorrt": @@ -280,14 +289,6 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 dynamic_shape=vae_decoder_dynamic_shape, device_id=device_id, ) - vae_encoder_runtime = create_trt_runtime( - args.model_dir, - args.vae_encoder_model_prefix, - args.model_format, - workspace=(1 << 30), - dynamic_shape=vae_encoder_dynamic_shape, - device_id=device_id, - ) start = time.time() unet_runtime = create_trt_runtime( args.model_dir, @@ -304,9 +305,6 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 vae_decoder_runtime = create_paddle_lite_runtime( args.model_dir, args.vae_decoder_model_prefix, device=args.device, device_id=device_id ) - vae_encoder_runtime = create_paddle_lite_runtime( - args.model_dir, args.vae_encoder_model_prefix, device=args.device, device_id=device_id - ) start = time.time() unet_runtime = create_paddle_lite_runtime( args.model_dir, args.unet_model_prefix, device=args.device, device_id=device_id @@ -314,7 +312,7 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 print(f"Spend {time.time() - start : .2f} s to load unet model.") pipe = FastDeployCycleDiffusionPipeline( - vae_encoder=FastDeployRuntimeModel(model=vae_encoder_runtime), + vae_encoder=None, vae_decoder=FastDeployRuntimeModel(model=vae_decoder_runtime), text_encoder=FastDeployRuntimeModel(model=text_encoder_runtime), tokenizer=tokenizer, diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py index c2c647b5a3fd..a9c34049076d 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py @@ -259,8 +259,6 @@ def _encode_prompt( prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int64)) prompt_embeds = paddle.to_tensor(prompt_embeds[0]) - prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) - bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) @@ -304,7 +302,6 @@ def _encode_prompt( if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) From e58785542f6e7fa644297d04d4f0fe953315686c Mon Sep 17 00:00:00 2001 From: zhoushunjie Date: Wed, 22 Feb 2023 15:04:20 +0000 Subject: [PATCH 04/17] fix vae encoder --- .../deploy/text_guided_img_to_img_infer.py | 40 ++++++++++++++++--- .../pipeline_fastdeploy_cycle_diffusion.py | 10 ++--- 2 
files changed, 38 insertions(+), 12 deletions(-) diff --git a/ppdiffusers/deploy/text_guided_img_to_img_infer.py b/ppdiffusers/deploy/text_guided_img_to_img_infer.py index 0b305e8f89c0..cf7c0e3ec2b3 100644 --- a/ppdiffusers/deploy/text_guided_img_to_img_infer.py +++ b/ppdiffusers/deploy/text_guided_img_to_img_infer.py @@ -209,6 +209,13 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 "opt_shape": [2, 4, 64, 64], } } + vae_encoder_dynamic_shape = { + "sample": { + "min_shape": [1, 3, 512, 512], + "max_shape": [2, 3, 512, 512], + "opt_shape": [2, 3, 512, 512], + } + } text_encoder_shape = { "input_ids": { "min_shape": [1, 77], @@ -219,8 +226,8 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 unet_dynamic_shape = { "sample": { "min_shape": [1, 4, 64, 64], - "max_shape": [2, 4, 64, 64], - "opt_shape": [2, 4, 64, 64], + "max_shape": [4, 4, 64, 64], + "opt_shape": [4, 4, 64, 64], }, "timestep": { "min_shape": [1], @@ -229,8 +236,8 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 }, "encoder_hidden_states": { "min_shape": [1, 77, 768], - "max_shape": [2, 77, 768], - "opt_shape": [2, 77, 768], + "max_shape": [4, 77, 768], + "opt_shape": [4, 77, 768], }, } # 4. Init runtime @@ -241,6 +248,9 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 vae_decoder_runtime = create_ort_runtime( args.model_dir, args.vae_decoder_model_prefix, args.model_format, device_id=device_id ) + vae_encoder_runtime = create_ort_runtime( + args.model_dir, args.vae_encoder_model_prefix, args.model_format, device_id=device_id + ) start = time.time() unet_runtime = create_ort_runtime( args.model_dir, args.unet_model_prefix, args.model_format, device_id=device_id @@ -268,6 +278,15 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 device_id=device_id, paddle_stream=paddle_stream, ) + vae_encoder_runtime = create_paddle_inference_runtime( + args.model_dir, + args.vae_encoder_model_prefix, + use_trt, + vae_encoder_dynamic_shape, + use_fp16=args.use_fp16, + device_id=device_id, + paddle_stream=paddle_stream, + ) start = time.time() unet_runtime = create_paddle_inference_runtime( args.model_dir, @@ -289,6 +308,14 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 dynamic_shape=vae_decoder_dynamic_shape, device_id=device_id, ) + vae_encoder_runtime = create_trt_runtime( + args.model_dir, + args.vae_encoder_model_prefix, + args.model_format, + workspace=(1 << 30), + dynamic_shape=vae_encoder_dynamic_shape, + device_id=device_id, + ) start = time.time() unet_runtime = create_trt_runtime( args.model_dir, @@ -305,6 +332,9 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 vae_decoder_runtime = create_paddle_lite_runtime( args.model_dir, args.vae_decoder_model_prefix, device=args.device, device_id=device_id ) + vae_encoder_runtime = create_paddle_lite_runtime( + args.model_dir, args.vae_encoder_model_prefix, device=args.device, device_id=device_id + ) start = time.time() unet_runtime = create_paddle_lite_runtime( args.model_dir, args.unet_model_prefix, device=args.device, device_id=device_id @@ -312,7 +342,7 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31 print(f"Spend {time.time() - start : .2f} s to load unet model.") pipe = FastDeployCycleDiffusionPipeline( - vae_encoder=None, + vae_encoder=FastDeployRuntimeModel(model=vae_encoder_runtime), 
         vae_decoder=FastDeployRuntimeModel(model=vae_decoder_runtime),
         text_encoder=FastDeployRuntimeModel(model=text_encoder_runtime),
         tokenizer=tokenizer,
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
index a9c34049076d..d61cf1659c69 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
@@ -422,14 +422,10 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )

-        if isinstance(generator, list):
-            init_latents = [
-                self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
-            ]
-            init_latents = paddle.concat(init_latents, axis=0)
-        else:
-            init_latents = self.vae.encode(image).latent_dist.sample(generator)
+        image = image.astype(dtype)
+        init_latents = self.vae_encoder(sample=image)[0]
         init_latents = 0.18215 * init_latents
+        init_latents = paddle.to_tensor(init_latents)

         if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
             # expand init_latents for batch_size

From 3d80acc3b719d0ce406fd7b635a145796063a5d1 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Thu, 23 Feb 2023 10:02:25 +0000
Subject: [PATCH 05/17] Fix pipeline bug

---
 ppdiffusers/deploy/text_guided_img_to_img_infer.py           | 2 +-
 .../stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ppdiffusers/deploy/text_guided_img_to_img_infer.py b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
index cf7c0e3ec2b3..bcd60b08ca79 100644
--- a/ppdiffusers/deploy/text_guided_img_to_img_infer.py
+++ b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
@@ -196,7 +196,7 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31
         paddle_stream = paddle.device.cuda.current_stream(device_id).cuda_stream

     # 1. Init scheduler
-    scheduler = DDIMScheduler.from_pretrained(os.path.join(args.model_dir, "scheduler"))
+    scheduler = DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")

     # 2. Init tokenizer
     tokenizer = CLIPTokenizer.from_pretrained(os.path.join(args.model_dir, "tokenizer"))
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
index d61cf1659c69..c77346cd37b1 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
@@ -596,7 +596,7 @@ def __call__(
         unet_output_name = self.unet.model.get_output_info(0).name
         unet_input_names = [self.unet.model.get_input_info(i).name for i in range(self.unet.model.num_inputs())]

-        height, width = image[-2:]
+        height, width = image.shape[-2:]
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 concat_noise_pred = paddle.zeros(

From 323ff318e499c6b3e4a1df2c24547fc8ae1c566e Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Thu, 23 Feb 2023 11:25:33 +0000
Subject: [PATCH 06/17] Add none paddle_stream

---
 ppdiffusers/deploy/text_to_img_infer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ppdiffusers/deploy/text_to_img_infer.py b/ppdiffusers/deploy/text_to_img_infer.py
index 429ba1a79f43..4151219e7f6f 100644
--- a/ppdiffusers/deploy/text_to_img_infer.py
+++ b/ppdiffusers/deploy/text_to_img_infer.py
@@ -214,6 +214,7 @@ def get_scheduler(args):
     if args.device == "cpu":
         device_id = -1
         paddle.set_device("cpu")
+        paddle_stream = None
     else:
         paddle.set_device(f"gpu:{device_id}")
         paddle_stream = paddle.device.cuda.current_stream(device_id).cuda_stream

From a918be3579eb836405566f5d60006c02e90ced0f Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Thu, 23 Feb 2023 11:48:14 +0000
Subject: [PATCH 07/17] Add paddle_stream None

---
 ppdiffusers/deploy/text_guided_img_to_img_infer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ppdiffusers/deploy/text_guided_img_to_img_infer.py b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
index bcd60b08ca79..481416cd1158 100644
--- a/ppdiffusers/deploy/text_guided_img_to_img_infer.py
+++ b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
@@ -191,6 +191,7 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31
     if args.device == "cpu":
         device_id = -1
         paddle.set_device("cpu")
+        paddle_stream = None
     else:
         paddle.set_device(f"gpu:{device_id}")
         paddle_stream = paddle.device.cuda.current_stream(device_id).cuda_stream

From 9684b4934948e9355cb70c395924630f578ddc6c Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Thu, 23 Feb 2023 15:00:51 +0000
Subject: [PATCH 08/17] Add synchronize

---
 .../stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
index c77346cd37b1..a36afef335b3 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
@@ -665,7 +665,9 @@ def __call__(
                 latents = self.scheduler.step(
                     noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs
                 ).prev_sample
-
+                if i == len(timesteps) - 1:
+                    # sync for accuracy it/s measure
+                    paddle.device.cuda.synchronize()
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()

From 5b2f05358f4595a7421ae56ab5110d3cf61f67ec Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Sun, 26 Feb 2023 16:18:18 +0000
Subject: [PATCH 09/17] Fix cycle diffusion

---
 .../pipeline_fastdeploy_cycle_diffusion.py | 27 +++++++++++--------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
index a36afef335b3..85c631a000e9 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
@@ -594,8 +594,8 @@ def __call__(
         # 8. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        unet_output_name = self.unet.model.get_output_info(0).name
-        unet_input_names = [self.unet.model.get_input_info(i).name for i in range(self.unet.model.num_inputs())]
+        # unet_output_name = self.unet.model.get_output_info(0).name
+        # unet_input_names = [self.unet.model.get_input_info(i).name for i in range(self.unet.model.num_inputs())]
         height, width = image.shape[-2:]
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
@@ -629,15 +629,20 @@ def __call__(
                 )

                 # predict the noise residual
-                self.unet.zero_copy_infer(
-                    prebinded_inputs={
-                        unet_input_names[0]: concat_latent_model_input,
-                        unet_input_names[1]: t,
-                        unet_input_names[2]: concat_prompt_embeds,
-                    },
-                    prebinded_outputs={unet_output_name: concat_noise_pred},
-                    share_with_raw_ptr=True,
-                )
+                # TODO(zhoushunjie): Use zero copy infer in the future
+                # self.unet.zero_copy_infer(
+                #     prebinded_inputs={
+                #         unet_input_names[0]: concat_latent_model_input,
+                #         unet_input_names[1]: t,
+                #         unet_input_names[2]: concat_prompt_embeds,
+                #     },
+                #     prebinded_outputs={unet_output_name: concat_noise_pred},
+                #     share_with_raw_ptr=True,
+                # )
+                concat_noise_pred = self.unet(
+                    sample=concat_latent_model_input, timestep=t, encoder_hidden_states=concat_prompt_embeds
+                )[0]
+                concat_noise_pred = paddle.to_tensor(concat_noise_pred)

                 # perform guidance
                 (

From d19389d247b02d281da68ef0d4a460b130d19c51 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Mon, 27 Feb 2023 02:54:04 +0000
Subject: [PATCH 10/17] Add benchmark steps

---
 .../deploy/text_guided_img_to_img_infer.py | 39 ++++++++++++++++---
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/ppdiffusers/deploy/text_guided_img_to_img_infer.py b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
index 481416cd1158..7f278f40a46b 100644
--- a/ppdiffusers/deploy/text_guided_img_to_img_infer.py
+++ b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
@@ -17,6 +17,7 @@
 from io import BytesIO

 import fastdeploy as fd
+import numpy as np
 import paddle
 import requests
 from fastdeploy import ModelFormat
@@ -49,8 +50,11 @@ def parse_arguments():
     parser.add_argument(
         "--text_encoder_model_prefix", default="text_encoder", help="The file prefix of text_encoder model."
     )
-    parser.add_argument("--inference_steps", type=int, default=50, help="The number of unet inference steps.")
+    parser.add_argument("--inference_steps", type=int, default=100, help="The number of unet inference steps.")
     parser.add_argument("--benchmark_steps", type=int, default=1, help="The number of performance benchmark steps.")
+    parser.add_argument(
+        "--image_path", default="horse_to_elephant.png", help="The output image path."
+    )
     parser.add_argument(
         "--backend",
         type=str,
@@ -365,14 +369,39 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31
     prompt = "An astronaut riding an elephant"

     # 7. Call the pipeline
-    image = pipe(
+    # Warm up
+    pipe.scheduler.set_timesteps(10)
+    pipe(
         prompt=prompt,
         source_prompt=source_prompt,
         image=init_image,
-        num_inference_steps=100,
+        num_inference_steps=10,
         eta=0.1,
         strength=0.8,
         guidance_scale=2,
         source_guidance_scale=1,
-    ).images[0]
-    image.save("horse_to_elephant.png")
+    )
+    time_costs = []
+    print(f"Run the cycle diffusion pipeline {args.benchmark_steps} times to test the performance.")
+    pipe.scheduler.set_timesteps(args.inference_steps)
+    for step in range(args.benchmark_steps):
+        start = time.time()
+        image = pipe(
+            prompt=prompt,
+            source_prompt=source_prompt,
+            image=init_image,
+            num_inference_steps=args.inference_steps,
+            eta=0.1,
+            strength=0.8,
+            guidance_scale=2,
+            source_guidance_scale=1,
+        ).images[0]
+        latency = time.time() - start
+        time_costs += [latency]
+        print(f"No {step:3d} time cost: {latency:2f} s")
+    print(
+        f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+        f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+    )
+    image.save(f"{args.image_path}")
+    print(f"Image saved in {args.image_path}!")

From 0d404880e6bf04513de7ffbda3e63fcba5477b51 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Mon, 27 Feb 2023 05:37:23 +0000
Subject: [PATCH 11/17] Update cycle diffusion

---
 ppdiffusers/deploy/text_guided_img_to_img_infer.py           | 2 --
 .../stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py  | 4 +---
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/ppdiffusers/deploy/text_guided_img_to_img_infer.py b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
index 7f278f40a46b..870219656ba8 100644
--- a/ppdiffusers/deploy/text_guided_img_to_img_infer.py
+++ b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
@@ -370,7 +370,6 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31
     # 7. Call the pipeline
     # Warm up
-    pipe.scheduler.set_timesteps(10)
     pipe(
         prompt=prompt,
         source_prompt=source_prompt,
         image=init_image,
@@ -383,7 +382,6 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31
     )
     time_costs = []
     print(f"Run the cycle diffusion pipeline {args.benchmark_steps} times to test the performance.")
-    pipe.scheduler.set_timesteps(args.inference_steps)
     for step in range(args.benchmark_steps):
         start = time.time()
         image = pipe(
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
index 85c631a000e9..eaa45e5e14ab 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
@@ -242,7 +242,7 @@ def _encode_prompt(
                 return_tensors="pd",
             )
             text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="np").input_ids

             if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
                 text_input_ids, untruncated_ids
@@ -254,8 +254,6 @@ def _encode_prompt(
                     "The following part of your input was truncated because CLIP can only handle sequences up to"
                     f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                 )
-            text_input_ids = text_input_ids.numpy()
-
             prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int64))
             prompt_embeds = paddle.to_tensor(prompt_embeds[0])

From 5255feed3c9f495d0e7fe1d293703b5bbd06f243 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Mon, 27 Feb 2023 05:50:38 +0000
Subject: [PATCH 12/17] pd->np

---
 .../stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
index eaa45e5e14ab..12b4b6a16aca 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
@@ -239,7 +239,7 @@ def _encode_prompt(
                 padding="max_length",
                 max_length=self.tokenizer.model_max_length,
                 truncation=True,
-                return_tensors="pd",
+                return_tensors="np",
             )
             text_input_ids = text_inputs.input_ids
             untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="np").input_ids

From db5d401ef4d1c7e7fceb7266649bb1ec4b68b815 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Mon, 27 Feb 2023 07:04:04 +0000
Subject: [PATCH 13/17] add numpy()

---
 .../stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
index 12b4b6a16aca..ae9ed7443e96 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
@@ -638,7 +638,9 @@ def __call__(
                 #     share_with_raw_ptr=True,
                 # )
                 concat_noise_pred = self.unet(
-                    sample=concat_latent_model_input, timestep=t, encoder_hidden_states=concat_prompt_embeds
+                    sample=concat_latent_model_input.numpy(),
+                    timestep=t.numpy(),
+                    encoder_hidden_states=concat_prompt_embeds.numpy(),
                 )[0]
                 concat_noise_pred = paddle.to_tensor(concat_noise_pred)

From f2960c36bb3e3d6c854832043e4b1ab80a9a3789 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Mon, 27 Feb 2023 23:54:17 +0000
Subject: [PATCH 14/17] use_trt=False

---
 ppdiffusers/deploy/text_guided_img_to_img_infer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ppdiffusers/deploy/text_guided_img_to_img_infer.py b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
index 870219656ba8..e4eb04faeb06 100644
--- a/ppdiffusers/deploy/text_guided_img_to_img_infer.py
+++ b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
@@ -263,11 +263,10 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31
         print(f"Spend {time.time() - start : .2f} s to load unet model.")
     elif args.backend == "paddle" or args.backend == "paddle-tensorrt":
         use_trt = True if args.backend == "paddle-tensorrt" else False
-        # Note(zhoushunjie): Will change to paddle runtime later
         text_encoder_runtime = create_paddle_inference_runtime(
             args.model_dir,
             args.text_encoder_model_prefix,
-            use_trt,
+            False,  # use_trt
             text_encoder_shape,
             use_fp16=args.use_fp16,
             device_id=device_id,

From 6feb2b53e2d02fab262c33a1bf34dcc188a0fde5 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Tue, 28 Feb 2023 11:01:37 +0000
Subject: [PATCH 15/17] Cast to float32

---
 ppdiffusers/deploy/text_guided_img_to_img_infer.py           | 2 +-
 .../stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ppdiffusers/deploy/text_guided_img_to_img_infer.py b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
index e4eb04faeb06..730234326b5a 100644
--- a/ppdiffusers/deploy/text_guided_img_to_img_infer.py
+++ b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
@@ -266,7 +266,7 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31
         text_encoder_runtime = create_paddle_inference_runtime(
             args.model_dir,
             args.text_encoder_model_prefix,
-            False,  # use_trt
+            use_trt,
             text_encoder_shape,
             use_fp16=args.use_fp16,
             device_id=device_id,
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
index ae9ed7443e96..1680b962a4d4 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
@@ -639,7 +639,7 @@ def __call__(
                 # )
                 concat_noise_pred = self.unet(
                     sample=concat_latent_model_input.numpy(),
-                    timestep=t.numpy(),
+                    timestep=t.cast("float32").numpy(),
                     encoder_hidden_states=concat_prompt_embeds.numpy(),
                 )[0]
                 concat_noise_pred = paddle.to_tensor(concat_noise_pred)

From c8c1474dc59aada1f96cb3d78de560e2a69d7b1d Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Tue, 28 Feb 2023 11:52:23 +0000
Subject: [PATCH 16/17] np -> pdtensor

---
 .../stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
index 1680b962a4d4..bbaa4f1d4954 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py
@@ -245,7 +245,7 @@ def _encode_prompt(
             untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="np").input_ids

             if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all(
-                text_input_ids, untruncated_ids
+                paddle.to_tensor(text_input_ids), paddle.to_tensor(untruncated_ids)
             ):
                 removed_text = self.tokenizer.batch_decode(
                     untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]

From c33ed9a26927b8b08bde13f4c0c2c783571fc432 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Tue, 28 Feb 2023 12:33:04 +0000
Subject: [PATCH 17/17] Update new api

---
 .../deploy/text_guided_img_to_img_infer.py | 30 ++++++++-----------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/ppdiffusers/deploy/text_guided_img_to_img_infer.py b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
index 730234326b5a..a2fa8cea6c3b 100644
--- a/ppdiffusers/deploy/text_guided_img_to_img_infer.py
+++ b/ppdiffusers/deploy/text_guided_img_to_img_infer.py
@@ -122,17 +122,15 @@ def create_paddle_inference_runtime(
     if use_fp16:
         option.trt_option.enable_fp16 = True
     cache_file = os.path.join(model_dir, model_prefix, "inference.trt")
-    option.set_trt_cache_file(cache_file)
+    option.trt_option.serialize_file = cache_file
     # Need to enable collect shape for ernie
     if dynamic_shape is not None:
-        option.enable_paddle_trt_collect_shape()
+        option.paddle_infer_option.collect_trt_shape = True
         for key, shape_dict in dynamic_shape.items():
-            option.set_trt_input_shape(
-                key,
-                min_shape=shape_dict["min_shape"],
-                opt_shape=shape_dict.get("opt_shape", None),
-                max_shape=shape_dict.get("max_shape", None),
+            option.trt_option.set_shape(
+                key, shape_dict["min_shape"], shape_dict.get("opt_shape", None), shape_dict.get("max_shape", None)
             )
+
     model_file = os.path.join(model_dir, model_prefix, "inference.pdmodel")
     params_file = os.path.join(model_dir, model_prefix, "inference.pdiparams")
     option.set_model_path(model_file, params_file)
@@ -144,9 +142,8 @@ def create_paddle_lite_runtime(model_dir, model_prefix, device="cpu", device_id=
     option.use_lite_backend()
     if device == "huawei_ascend_npu":
         option.use_ascend()
-        option.set_lite_device_names(["huawei_ascend_npu"])
-        option.set_lite_model_cache_dir(os.path.join(model_dir, model_prefix))
-        option.set_lite_context_properties(
+        option.paddle_lite_option.nnadapter_model_cache_dir = os.path.join(model_dir, model_prefix)
+        option.paddle_lite_option.nnadapter_context_properties = (
             "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format(
                 device_id
             )
@@ -166,15 +163,12 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31
     option = fd.RuntimeOption()
     option.use_trt_backend()
     option.use_gpu(device_id)
-    option.enable_trt_fp16()
-    option.set_trt_max_workspace_size(workspace)
+    option.trt_option.enable_fp16 = True
+    option.trt_option.max_workspace_size = workspace
     if dynamic_shape is not None:
         for key, shape_dict in dynamic_shape.items():
-            option.set_trt_input_shape(
-                key,
-                min_shape=shape_dict["min_shape"],
-                opt_shape=shape_dict.get("opt_shape", None),
-                max_shape=shape_dict.get("max_shape", None),
+            option.trt_option.set_shape(
+                key, shape_dict["min_shape"], shape_dict.get("opt_shape", None), shape_dict.get("max_shape", None)
             )
     if model_format == "paddle":
         model_file = os.path.join(model_dir, model_prefix, "inference.pdmodel")
@@ -184,7 +178,7 @@ def create_trt_runtime(model_dir, model_prefix, model_format, workspace=(1 << 31
         onnx_file = os.path.join(model_dir, model_prefix, "inference.onnx")
         option.set_model_path(onnx_file, model_format=ModelFormat.ONNX)
     cache_file = os.path.join(model_dir, model_prefix, "inference.trt")
-    option.set_trt_cache_file(cache_file)
+    option.trt_option.serialize_file = cache_file
     return fd.Runtime(option)
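
For reference, PATCH 17 migrates the deploy script from the older set_trt_* / set_lite_* helpers to attribute-style options (option.trt_option.*, option.paddle_infer_option.*, option.paddle_lite_option.*). The sketch below condenses the resulting create_trt_runtime logic into a standalone helper; it is not part of the patch series, the function name build_trt_runtime and its defaults are illustrative, and only option attributes that appear in the diff above are used.

import os

import fastdeploy as fd


def build_trt_runtime(model_dir, model_prefix, dynamic_shape=None, device_id=0, workspace=(1 << 31)):
    option = fd.RuntimeOption()
    option.use_trt_backend()
    option.use_gpu(device_id)
    # Attribute-style TensorRT settings, as used after PATCH 17.
    option.trt_option.enable_fp16 = True
    option.trt_option.max_workspace_size = workspace
    # Cache the serialized engine so later runs skip the TensorRT build step.
    option.trt_option.serialize_file = os.path.join(model_dir, model_prefix, "inference.trt")
    if dynamic_shape is not None:
        for name, shape in dynamic_shape.items():
            # set_shape(name, min_shape, opt_shape, max_shape) replaces set_trt_input_shape.
            option.trt_option.set_shape(
                name, shape["min_shape"], shape.get("opt_shape"), shape.get("max_shape")
            )
    option.set_model_path(
        os.path.join(model_dir, model_prefix, "inference.pdmodel"),
        os.path.join(model_dir, model_prefix, "inference.pdiparams"),
    )
    return fd.Runtime(option)

The dynamic_shape dictionaries fed to such a helper follow the min_shape/opt_shape/max_shape layout defined earlier in the script (for example the vae_encoder and unet shape tables added in PATCH 04).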
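Similarly, the warm-up-then-measure flow added in PATCH 10 is easier to read outside the diff. The following is a minimal sketch of that pattern under stated assumptions: benchmark_pipeline is an illustrative name, pipe stands for whatever FastDeployCycleDiffusionPipeline instance the deploy script builds, and call_kwargs carries the prompt/image arguments shown in the patch.

import time

import numpy as np


def benchmark_pipeline(pipe, call_kwargs, benchmark_steps=10, warmup_steps=10):
    # Run once with fewer denoising steps so engines, kernels and memory pools
    # are initialized before any timing starts.
    pipe(**dict(call_kwargs, num_inference_steps=warmup_steps))

    time_costs = []
    for step in range(benchmark_steps):
        start = time.time()
        image = pipe(**call_kwargs).images[0]
        latency = time.time() - start
        time_costs.append(latency)
        print(f"No {step:3d} time cost: {latency:.2f} s")

    # Summarize with the same statistics the deploy script reports.
    print(
        f"Mean latency: {np.mean(time_costs):.2f} s, "
        f"p50: {np.percentile(time_costs, 50):.2f} s, "
        f"p90: {np.percentile(time_costs, 90):.2f} s, "
        f"p95: {np.percentile(time_costs, 95):.2f} s."
    )
    return image, time_costs

The warm-up call matters most for the tensorrt and paddle-tensorrt backends, where the first invocation pays the engine build or deserialization cost; timing only the subsequent runs gives a steadier latency estimate.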