From 1830f8739bebe6ec70bb55301c75463975cbb9cc Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Wed, 29 Nov 2023 11:39:01 +0800
Subject: [PATCH 01/19] Update lora

---
 ppdiffusers/ppdiffusers/loaders.py            |  597 ++++--
 ppdiffusers/ppdiffusers/models/activations.py |   94 +-
 ppdiffusers/ppdiffusers/models/attention.py   |  322 +--
 .../ppdiffusers/models/attention_processor.py | 1049 ++++++----
 ppdiffusers/ppdiffusers/models/embeddings.py  |  269 ++-
 ppdiffusers/ppdiffusers/models/lora.py        |  229 +-
 .../ppdiffusers/models/normalization.py       |  150 ++
 .../ppdiffusers/models/prior_transformer.py   |   34 +-
 ppdiffusers/ppdiffusers/models/resnet.py      |  115 +-
 .../ppdiffusers/models/transformer_2d.py      |  160 +-
 .../ppdiffusers/models/unet_1d_blocks.py      |  104 +-
 ppdiffusers/ppdiffusers/models/unet_2d.py     |   19 +-
 .../ppdiffusers/models/unet_2d_blocks.py      |  620 ++++--
 .../ppdiffusers/models/unet_2d_condition.py   |  254 ++-
 .../ppdiffusers/models/unet_3d_blocks.py      |  898 ++++++++
 .../ppdiffusers/models/unet_3d_condition.py   |   88 +-
 .../pipeline_stable_diffusion.py              |   76 +-
 .../pipeline_stable_diffusion_xl.py           |  119 +-
 ppdiffusers/ppdiffusers/utils/__init__.py     |    1 +
 ppdiffusers/ppdiffusers/utils/constants.py    |    3 +
 ppdiffusers/ppdiffusers/utils/paddle_utils.py |   60 +
 ppdiffusers/tests/lora/test_lora_layers.py    | 1853 +++++++++++++++++
 ppdiffusers/tests/models/test_activations.py  |   62 +
 ppdiffusers/tests/models/test_lora_layers.py  |  735 -------
 .../tests/models/test_modeling_common.py      |   13 +-
 .../tests/models/test_models_unet_1d.py       |    3 +
 .../tests/models/test_models_unet_2d.py       |   29 +
 .../models/test_models_unet_2d_condition.py   |  202 +-
 .../models/test_models_unet_3d_condition.py   |  207 --
 29 files changed, 6169 insertions(+), 2196 deletions(-)
 create mode 100644 ppdiffusers/ppdiffusers/models/normalization.py
 create mode 100644 ppdiffusers/tests/lora/test_lora_layers.py
 create mode 100644 ppdiffusers/tests/models/test_activations.py
 delete mode 100644 ppdiffusers/tests/models/test_lora_layers.py

diff --git a/ppdiffusers/ppdiffusers/loaders.py b/ppdiffusers/ppdiffusers/loaders.py
index cc4a71303..5dc45476f 100644
--- a/ppdiffusers/ppdiffusers/loaders.py
+++ b/ppdiffusers/ppdiffusers/loaders.py
@@ -17,13 +17,15 @@
 import re
 import warnings
 from collections import defaultdict
+from contextlib import nullcontext
 from io import BytesIO
 from pathlib import Path
 from typing import Callable, Dict, List, Optional, Union
 
 import paddle
 import paddle.nn as nn
-import paddle.nn.functional as F
+
+# import paddle.nn.functional as F  # TODO: remove
 import requests
 from huggingface_hub import hf_hub_download
 from huggingface_hub.file_download import _request_wrapper, hf_raise_for_status
@@ -66,6 +68,8 @@
         PretrainedTokenizer,
     )
 
+logger = logging.get_logger(__name__)
+
 TEXT_ENCODER_NAME = "text_encoder"
 UNET_NAME = "unet"
 
@@ -117,7 +121,73 @@ def __init__(self, regular_linear_layer, lora_scale=1, network_alpha=None, rank=
 
         self.lora_scale = lora_scale
 
+    # overwrite PyTorch's `state_dict` to be sure that only the 'regular_linear_layer' weights are saved
+    # when saving the whole text encoder model and when LoRA is unloaded or fused
+    def state_dict(self, *args, destination=None, prefix="", keep_vars=False):
+        if self.lora_linear_layer is None:
+            return self.regular_linear_layer.state_dict(
+                *args, destination=destination, prefix=prefix, keep_vars=keep_vars
+            )
+
+        return super().state_dict(*args, destination=destination, prefix=prefix, keep_vars=keep_vars)
+
+    def _fuse_lora(self, lora_scale=1.0):
+        if self.lora_linear_layer is None:
+            return
+
+        dtype = self.regular_linear_layer.weight.dtype
+
+        w_orig = self.regular_linear_layer.weight.astype(paddle.get_default_dtype())
+        w_up = self.lora_linear_layer.up.weight.astype(paddle.get_default_dtype())
+        w_down = self.lora_linear_layer.down.weight.astype(paddle.get_default_dtype())
+
+        if self.lora_linear_layer.network_alpha is not None:
+            w_up = w_up * self.lora_linear_layer.network_alpha / self.lora_linear_layer.rank
+
+        # Weight is reversed in the torch and paddle, use .T
+        fused_weight = w_orig + (lora_scale * paddle.bmm(w_up.T[None, :], w_down.T[None, :])[0]).T
+        out_0 = fused_weight.cast(dtype=dtype)
+        self.regular_linear_layer.weight = self.create_parameter(
+            shape=out_0.shape,
+            default_initializer=nn.initializer.Assign(out_0),
+        )
+
+        # we can drop the lora layer now
+        self.lora_linear_layer = None
+
+        # offload the up and down matrices to CPU to not blow the memory
+        self.w_up = w_up.cpu()
+        self.w_down = w_down.cpu()
+        self.lora_scale = lora_scale
+
+    def _unfuse_lora(self):
+        if not (hasattr(self, "w_up") and hasattr(self, "w_down")):
+            return
+
+        fused_weight = self.regular_linear_layer.weight
+        dtype = fused_weight.dtype
+
+        w_up = self.w_up.astype(paddle.get_default_dtype())
+        w_down = self.w_down.astype(paddle.get_default_dtype())
+
+        unfused_weight = (
+            fused_weight.astype(paddle.get_default_dtype())
+            - (self.lora_scale * paddle.bmm(w_up.T[None, :], w_down.T[None, :])[0]).T
+        )
+        out_0 = unfused_weight.cast(dtype=dtype)
+        self.regular_linear_layer.weight = self.create_parameter(
+            shape=out_0.shape,
+            default_initializer=nn.initializer.Assign(out_0),
+        )
+
+        self.w_up = None
+        self.w_down = None
+
     def forward(self, input):
+        if self.lora_scale is None:
+            self.lora_scale = 1.0
+        if self.lora_linear_layer is None:
+            return self.regular_linear_layer(input)
         return self.regular_linear_layer(input) + self.lora_scale * self.lora_linear_layer(input)
 
 
@@ -257,17 +327,19 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
         """
-        from .models.attention_processor import (
-            AttnAddedKVProcessor,
-            AttnAddedKVProcessor2_5,
-            CustomDiffusionAttnProcessor,
-            LoRAAttnAddedKVProcessor,
-            LoRAAttnProcessor,
-            LoRAAttnProcessor2_5,
-            LoRAXFormersAttnProcessor,
-            SlicedAttnAddedKVProcessor,
-            XFormersAttnProcessor,
-        )
+        # TODO: remove
+        # from .models.attention_processor import (
+        #     AttnAddedKVProcessor,
+        #     AttnAddedKVProcessor2_5,
+        #     CustomDiffusionAttnProcessor,
+        #     LoRAAttnAddedKVProcessor,
+        #     LoRAAttnProcessor,
+        #     LoRAAttnProcessor2_5,
+        #     LoRAXFormersAttnProcessor,
+        #     SlicedAttnAddedKVProcessor,
+        #     XFormersAttnProcessor,
+        # )
+        from .models.attention_processor import CustomDiffusionAttnProcessor
         from .models.lora import (
             LoRACompatibleConv,
             LoRACompatibleLinear,
@@ -292,6 +364,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
         # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
         # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
         network_alphas = kwargs.pop("network_alphas", None)
+        is_network_alphas_none = network_alphas is None
 
         if from_diffusers and use_safetensors and not is_safetensors_available():
             raise ValueError(
@@ -368,8 +441,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
             state_dict = pretrained_model_name_or_path_or_dict
 
         # fill attn processors
-        attn_processors = {}
-        non_attn_lora_layers = []
+        lora_layers_list = []
 
         is_lora = all(("lora" in k or k.endswith(".alpha")) for k in state_dict.keys())
         is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys())
@@ -378,17 +450,13 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
             state_dict = transpose_state_dict(state_dict)
 
         if is_lora:
-            is_new_lora_format = all(
-                key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys()
-            )
-            if is_new_lora_format:
-                # Strip the `"unet"` prefix.
-                is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys())
-                if is_text_encoder_present:
-                    warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)."
-                    warnings.warn(warn_message)
-                unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)]
-                state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys}
+            # attn_processors = {}  # TODO: remove
+            # correct keys
+            state_dict, network_alphas = self.convert_state_dict_legacy_attn_format(state_dict, network_alphas)
+
+            if network_alphas is not None:
+                network_alphas_keys = list(network_alphas.keys())
+                used_network_alphas_keys = set()
 
             lora_grouped_dict = defaultdict(dict)
             mapped_network_alphas = {}
@@ -403,13 +471,20 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
 
                 # Create another `mapped_network_alphas` dictionary so that we can properly map them.
                 if network_alphas is not None:
-                    for k in network_alphas:
+                    for k in network_alphas_keys:
                         if k.replace(".alpha", "") in key:
-                            mapped_network_alphas.update({attn_processor_key: network_alphas[k]})
+                            mapped_network_alphas.update({attn_processor_key: network_alphas.get(k)})
+                            used_network_alphas_keys.add(k)
+
+            if not is_network_alphas_none:
+                if len(set(network_alphas_keys) - used_network_alphas_keys) > 0:
+                    raise ValueError(
+                        f"The `network_alphas` has to be empty at this point but has the following keys \n\n {', '.join(network_alphas.keys())}"
+                    )
 
             if len(state_dict) > 0:
                 raise ValueError(
-                    f"The state_dict has to be empty at this point but has the following keys \n\n {', '.join(state_dict.keys())}"
+                    f"The `state_dict` has to be empty at this point but has the following keys \n\n {', '.join(state_dict.keys())}"
                 )
 
             for key, value_dict in lora_grouped_dict.items():
@@ -419,7 +494,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
 
                 # Process non-attention layers, which don't have to_{k,v,q,out_proj}_lora layers
                 # or add_{k,v,q,out_proj}_proj_lora layers.
-                if "lora.down.weight" in value_dict:
+                if "lora.down.weight" in value_dict:  # TODO: remove this line
                     if value_dict["lora.down.weight"].ndim == 2:
                         rank = value_dict["lora.down.weight"].shape[1]
                     else:
@@ -451,66 +526,67 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
 
                     value_dict = {k.replace("lora.", ""): v for k, v in value_dict.items()}
                     lora.load_dict(value_dict)
-                    non_attn_lora_layers.append((attn_processor, lora))
-                else:
-                    # To handle SDXL.
-                    rank_mapping = {}
-                    hidden_size_mapping = {}
-                    for projection_id in ["to_k", "to_q", "to_v", "to_out"]:
-                        rank = value_dict[f"{projection_id}_lora.down.weight"].shape[1]
-                        hidden_size = value_dict[f"{projection_id}_lora.up.weight"].shape[1]
-
-                        rank_mapping.update({f"{projection_id}_lora.down.weight": rank})
-                        hidden_size_mapping.update({f"{projection_id}_lora.up.weight": hidden_size})
-
-                    if isinstance(
-                        attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_5)
-                    ):
-                        cross_attention_dim = value_dict["add_k_proj_lora.down.weight"].shape[0]
-                        attn_processor_class = LoRAAttnAddedKVProcessor
-                    else:
-                        cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0]
-                        if isinstance(attn_processor, (XFormersAttnProcessor, LoRAXFormersAttnProcessor)):
-                            attn_processor_class = LoRAXFormersAttnProcessor
-                        else:
-                            attn_processor_class = (
-                                LoRAAttnProcessor2_5
-                                if hasattr(F, "scaled_dot_product_attention_")
-                                else LoRAAttnProcessor
-                            )
-
-                    if attn_processor_class is not LoRAAttnAddedKVProcessor:
-                        attn_processors[key] = attn_processor_class(
-                            rank=rank_mapping.get("to_k_lora.down.weight"),
-                            hidden_size=hidden_size_mapping.get("to_k_lora.up.weight"),
-                            cross_attention_dim=cross_attention_dim,
-                            network_alpha=mapped_network_alphas.get(key),
-                            q_rank=rank_mapping.get("to_q_lora.down.weight"),
-                            q_hidden_size=hidden_size_mapping.get("to_q_lora.up.weight"),
-                            v_rank=rank_mapping.get("to_v_lora.down.weight"),
-                            v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight"),
-                            out_rank=rank_mapping.get("to_out_lora.down.weight"),
-                            out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight"),
-                            # rank=rank_mapping.get("to_k_lora.down.weight", None),
-                            # hidden_size=hidden_size_mapping.get("to_k_lora.up.weight", None),
-                            # q_rank=rank_mapping.get("to_q_lora.down.weight", None),
-                            # q_hidden_size=hidden_size_mapping.get("to_q_lora.up.weight", None),
-                            # v_rank=rank_mapping.get("to_v_lora.down.weight", None),
-                            # v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight", None),
-                            # out_rank=rank_mapping.get("to_out_lora.down.weight", None),
-                            # out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight", None),
-                        )
-                    else:
-                        attn_processors[key] = attn_processor_class(
-                            rank=rank_mapping.get("to_k_lora.down.weight", None),
-                            hidden_size=hidden_size_mapping.get("to_k_lora.up.weight", None),
-                            cross_attention_dim=cross_attention_dim,
-                            network_alpha=mapped_network_alphas.get(key),
-                        )
-
-                    attn_processors[key].load_dict(value_dict)
-
+                    lora_layers_list.append((attn_processor, lora))
+                # TODO: remove
+                # else:
+                #     # To handle SDXL.
+                #     rank_mapping = {}
+                #     hidden_size_mapping = {}
+                #     for projection_id in ["to_k", "to_q", "to_v", "to_out"]:
+                #         rank = value_dict[f"{projection_id}_lora.down.weight"].shape[1]
+                #         hidden_size = value_dict[f"{projection_id}_lora.up.weight"].shape[1]
+
+                #         rank_mapping.update({f"{projection_id}_lora.down.weight": rank})
+                #         hidden_size_mapping.update({f"{projection_id}_lora.up.weight": hidden_size})
+
+                #     if isinstance(
+                #         attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_5)
+                #     ):
+                #         cross_attention_dim = value_dict["add_k_proj_lora.down.weight"].shape[0]
+                #         attn_processor_class = LoRAAttnAddedKVProcessor
+                #     else:
+                #         cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0]
+                #         if isinstance(attn_processor, (XFormersAttnProcessor, LoRAXFormersAttnProcessor)):
+                #             attn_processor_class = LoRAXFormersAttnProcessor
+                #         else:
+                #             attn_processor_class = (
+                #                 LoRAAttnProcessor2_5
+                #                 if hasattr(F, "scaled_dot_product_attention_")
+                #                 else LoRAAttnProcessor
+                #             )
+
+                #     if attn_processor_class is not LoRAAttnAddedKVProcessor:
+                #         attn_processors[key] = attn_processor_class(
+                #             rank=rank_mapping.get("to_k_lora.down.weight"),
+                #             hidden_size=hidden_size_mapping.get("to_k_lora.up.weight"),
+                #             cross_attention_dim=cross_attention_dim,
+                #             network_alpha=mapped_network_alphas.get(key),
+                #             q_rank=rank_mapping.get("to_q_lora.down.weight"),
+                #             q_hidden_size=hidden_size_mapping.get("to_q_lora.up.weight"),
+                #             v_rank=rank_mapping.get("to_v_lora.down.weight"),
+                #             v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight"),
+                #             out_rank=rank_mapping.get("to_out_lora.down.weight"),
+                #             out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight"),
+                #             # rank=rank_mapping.get("to_k_lora.down.weight", None),
+                #             # hidden_size=hidden_size_mapping.get("to_k_lora.up.weight", None),
+                #             # q_rank=rank_mapping.get("to_q_lora.down.weight", None),
+                #             # q_hidden_size=hidden_size_mapping.get("to_q_lora.up.weight", None),
+                #             # v_rank=rank_mapping.get("to_v_lora.down.weight", None),
+                #             # v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight", None),
+                #             # out_rank=rank_mapping.get("to_out_lora.down.weight", None),
+                #             # out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight", None),
+                #         )
+                #     else:
+                #         attn_processors[key] = attn_processor_class(
+                #             rank=rank_mapping.get("to_k_lora.down.weight", None),
+                #             hidden_size=hidden_size_mapping.get("to_k_lora.up.weight", None),
+                #             cross_attention_dim=cross_attention_dim,
+                #             network_alpha=mapped_network_alphas.get(key),
+                #         )
+
+                #     attn_processors[key].load_dict(value_dict)
         elif is_custom_diffusion:
+            attn_processors = {}
             custom_diffusion_grouped_dict = defaultdict(dict)
             for key, value in state_dict.items():
                 if len(value) == 0:
@@ -544,23 +620,47 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
                         cross_attention_dim=cross_attention_dim,
                     )
                     attn_processors[key].load_dict(value_dict)
+
+            self.set_attn_processor(attn_processors)
         else:
             raise ValueError(
                 f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training."
             )
-        # set correct dtype & device
-        attn_processors = {k: v.to(dtype=self.dtype) for k, v in attn_processors.items()}
-        non_attn_lora_layers = [(t, l.to(dtype=self.dtype)) for t, l in non_attn_lora_layers]
-        # set layers
-        self.set_attn_processor(attn_processors)
-
-        # set ff layers
-        for target_module, lora_layer in non_attn_lora_layers:
+
+        # set lora layers
+        for target_module, lora_layer in lora_layers_list:
             target_module.set_lora_layer(lora_layer)
             # It should raise an error if we don't have a set lora here
             # if hasattr(target_module, "set_lora_layer"):
             #     target_module.set_lora_layer(lora_layer)
 
+    def convert_state_dict_legacy_attn_format(self, state_dict, network_alphas):
+        is_new_lora_format = all(
+            key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys()
+        )
+        if is_new_lora_format:
+            # Strip the `"unet"` prefix.
+            is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys())
+            if is_text_encoder_present:
+                warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)."
+                logger.warn(warn_message)
+            unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)]
+            state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys}
+
+        # change processor format to 'pure' LoRACompatibleLinear format
+        if any("processor" in k.split(".") for k in state_dict.keys()):
+
+            def format_to_lora_compatible(key):
+                if "processor" not in key.split("."):
+                    return key
+                return key.replace(".processor", "").replace("to_out_lora", "to_out.0.lora").replace("_lora", ".lora")
+
+            state_dict = {format_to_lora_compatible(k): v for k, v in state_dict.items()}
+
+            if network_alphas is not None:
+                network_alphas = {format_to_lora_compatible(k): v for k, v in network_alphas.items()}
+        return state_dict, network_alphas
+
     def save_attn_procs(
         self,
         save_directory: Union[str, os.PathLike],
@@ -662,6 +762,21 @@ def save_function(weights, filename):
 
         logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}")
 
+    def fuse_lora(self, lora_scale=1.0):
+        self.lora_scale = lora_scale
+        self.apply(self._fuse_lora_apply)
+
+    def _fuse_lora_apply(self, module):
+        if hasattr(module, "_fuse_lora"):
+            module._fuse_lora(self.lora_scale)
+
+    def unfuse_lora(self):
+        self.apply(self._unfuse_lora_apply)
+
+    def _unfuse_lora_apply(self, module):
+        if hasattr(module, "_unfuse_lora"):
+            module._unfuse_lora()
+
 
 class TextualInversionLoaderMixin:
     r"""
@@ -673,6 +788,15 @@ def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "Pretra
         Processes prompts that include a special token corresponding to a multi-vector textual inversion embedding to
         be replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual
         inversion token or if the textual inversion token is a single vector, the input prompt is returned.
+
+        Parameters:
+            prompt (`str` or list of `str`):
+                The prompt or prompts to guide the image generation.
+            tokenizer (`PreTrainedTokenizer`):
+                The tokenizer responsible for encoding the prompt into input tokens.
+
+        Returns:
+            `str` or list of `str`: The converted prompt
         """
         if not isinstance(prompt, List):
             prompts = [prompt]
@@ -686,7 +810,7 @@ def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "Pretra
 
         return prompts
 
-    def _maybe_convert_prompt(self, prompt: str, tokenizer: "PretrainedTokenizer"):
+    def _maybe_convert_prompt(self, prompt: str, tokenizer: "PreTrainedTokenizer"):  # noqa: F821
         r"""
         Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds
         to a multi-vector textual inversion embedding, this function will process the prompt so that the special token
@@ -695,7 +819,7 @@ def _maybe_convert_prompt(self, prompt: str, tokenizer: "PretrainedTokenizer"):
         Parameters:
             prompt (`str`):
                 The prompt to guide the image generation.
-            tokenizer (`PretrainedTokenizer`):
+            tokenizer (`PreTrainedTokenizer`):
                 The tokenizer responsible for encoding the prompt into input tokens.
         Returns:
             `str`: The converted prompt
@@ -718,6 +842,8 @@ def load_textual_inversion(
         self,
         pretrained_model_name_or_path: Union[str, List[str], Dict[str, paddle.Tensor], List[Dict[str, paddle.Tensor]]],
         token: Optional[Union[str, List[str]]] = None,
+        tokenizer: Optional["PreTrainedTokenizer"] = None,  # noqa: F821
+        text_encoder: Optional["PreTrainedModel"] = None,  # noqa: F821
         **kwargs,
     ):
         r"""
@@ -736,6 +862,11 @@ def load_textual_inversion(
             token (`str` or `List[str]`, *optional*):
                 Override the token to use for the textual inversion weights. If `pretrained_model_name_or_path` is a
                 list, then `token` must also be a list of equal length.
+            text_encoder ([`~transformers.CLIPTextModel`], *optional*):
+                Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+                If not specified, function will take self.tokenizer.
+            tokenizer ([`~transformers.CLIPTokenizer`], *optional*):
+                A `CLIPTokenizer` to tokenize text. If not specified, function will take self.tokenizer.
             weight_name (`str`, *optional*):
                 Name of a custom weight file. This should be used when:
                     - The saved textual inversion file is in 🤗 Diffusers format, but was saved under a specific weight
@@ -793,6 +924,9 @@ def load_textual_inversion(
         image.save("character.png")
         ```
         """
+        tokenizer = tokenizer or getattr(self, "tokenizer", None)
+        text_encoder = text_encoder or getattr(self, "text_encoder", None)
+
         if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PretrainedTokenizer):
             raise ValueError(
                 f"{self.__class__.__name__} requires `self.tokenizer` of type `PretrainedTokenizer` for calling"
@@ -1000,6 +1134,7 @@ class LoraLoaderMixin:
     """
     text_encoder_name = TEXT_ENCODER_NAME
     unet_name = UNET_NAME
+    num_fused_loras = 0
 
     def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], **kwargs):
         """
@@ -1372,22 +1507,23 @@ def load_lora_into_text_encoder(cls, state_dict, network_alphas, text_encoder, p
                 How much to scale the output of the lora linear layer before it is added with the output of the regular
                 lora layer.
         """
-
         # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
         # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as
         # their prefixes.
         keys = list(state_dict.keys())
         prefix = cls.text_encoder_name if prefix is None else prefix
 
+        # Safe prefix to check with.
         if any(cls.text_encoder_name in key for key in keys):
             # Load the layers corresponding to text encoder and make necessary adjustments.
-            text_encoder_keys = [k for k in keys if k.startswith(prefix)]
+            text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix]
             text_encoder_lora_state_dict = {
                 k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
             }
 
             if len(text_encoder_lora_state_dict) > 0:
                 logger.info(f"Loading {prefix}.")
+                rank = {}
 
                 if any("to_out_lora" in k for k in text_encoder_lora_state_dict.keys()):
                     # Convert from the old naming convention to the new naming convention.
@@ -1426,23 +1562,34 @@ def load_lora_into_text_encoder(cls, state_dict, network_alphas, text_encoder, p
                             f"{name}.out_proj.lora_linear_layer.down.weight"
                         ] = text_encoder_lora_state_dict.pop(f"{name}.to_out_lora.down.weight")
 
-                rank = text_encoder_lora_state_dict[
-                    "text_model.transformer.layers.0.self_attn.out_proj.lora_linear_layer.up.weight"
-                ].shape[0]
+                for name, _ in text_encoder_attn_modules(text_encoder):
+                    rank_key = f"{name}.out_proj.lora_linear_layer.up.weight"
+                    # torch use 1, paddle use 0
+                    rank.update({rank_key: text_encoder_lora_state_dict[rank_key].shape[0]})
+
                 patch_mlp = any(".linear1." in key for key in text_encoder_lora_state_dict.keys())
+                if patch_mlp:
+                    for name, _ in text_encoder_mlp_modules(text_encoder):
+                        rank_key_fc1 = f"{name}.linear1.lora_linear_layer.up.weight"
+                        rank_key_fc2 = f"{name}.linear2.lora_linear_layer.up.weight"
+                        rank.update({rank_key_fc1: text_encoder_lora_state_dict[rank_key_fc1].shape[0]})
+                        rank.update({rank_key_fc2: text_encoder_lora_state_dict[rank_key_fc2].shape[0]})
 
-                cls._modify_text_encoder(
-                    text_encoder,
-                    lora_scale,
-                    network_alphas,
-                    rank=rank,
-                    patch_mlp=patch_mlp,
-                )
+                if network_alphas is not None:
+                    alpha_keys = [
+                        k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix
+                    ]
+                    network_alphas = {
+                        k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys
+                    }
+
+                cls._modify_text_encoder(text_encoder, lora_scale, network_alphas, rank=rank, patch_mlp=patch_mlp)
 
                 # set correct dtype & device
                 text_encoder_lora_state_dict = {
                     k: v._to(dtype=text_encoder.dtype) for k, v in text_encoder_lora_state_dict.items()
                 }
+
                 text_encoder.load_dict(text_encoder_lora_state_dict)
                 # load_state_dict_results = text_encoder.load_dict(text_encoder_lora_state_dict)
                 # if len(load_state_dict_results.unexpected_keys) != 0:
@@ -1463,16 +1610,16 @@ def _remove_text_encoder_monkey_patch(self):
     def _remove_text_encoder_monkey_patch_classmethod(cls, text_encoder):
         for _, attn_module in text_encoder_attn_modules(text_encoder):
             if isinstance(attn_module.q_proj, PatchedLoraProjection):
-                attn_module.q_proj = attn_module.q_proj.regular_linear_layer
-                attn_module.k_proj = attn_module.k_proj.regular_linear_layer
-                attn_module.v_proj = attn_module.v_proj.regular_linear_layer
-                attn_module.out_proj = attn_module.out_proj.regular_linear_layer
+                attn_module.q_proj.lora_linear_layer = None
+                attn_module.k_proj.lora_linear_layer = None
+                attn_module.v_proj.lora_linear_layer = None
+                attn_module.out_proj.lora_linear_layer = None
 
         for _, mlp_module in text_encoder_mlp_modules(text_encoder):
             if isinstance(mlp_module.linear1, PatchedLoraProjection):
-                mlp_module.linear1 = mlp_module.linear1.regular_linear_layer
+                mlp_module.linear1.lora_linear_layer = None
             if isinstance(mlp_module.linear2, PatchedLoraProjection):
-                mlp_module.linear2 = mlp_module.linear2.regular_linear_layer
+                mlp_module.linear2.lora_linear_layer = None
 
     # @classmethod
     # def _modify_text_encoder(
@@ -1587,7 +1734,7 @@ def _modify_text_encoder(
         text_encoder,
         lora_scale=1,
         network_alphas=None,
-        rank=4,
+        rank: Union[Dict[str, int], int] = 4,
         dtype=None,
         patch_mlp=False,
     ):
@@ -1595,52 +1742,65 @@ def _modify_text_encoder(
         Monkey-patches the forward passes of attention modules of the text encoder.
         """
 
+        def create_patched_linear_lora(model, network_alpha, rank, dtype, lora_parameters):
+            linear_layer = model.regular_linear_layer if isinstance(model, PatchedLoraProjection) else model
+            ctx = nullcontext
+            with ctx():
+                model = PatchedLoraProjection(linear_layer, lora_scale, network_alpha, rank, dtype=dtype)
+
+            lora_parameters.extend(model.lora_linear_layer.parameters())
+            return model
+
         # First, remove any monkey-patch that might have been applied before
         cls._remove_text_encoder_monkey_patch_classmethod(text_encoder)
 
         lora_parameters = []
         network_alphas = {} if network_alphas is None else network_alphas
+        is_network_alphas_populated = len(network_alphas) > 0
 
         for name, attn_module in text_encoder_attn_modules(text_encoder):
-            query_alpha = network_alphas.get(name + ".k.proj.alpha")
-            key_alpha = network_alphas.get(name + ".q.proj.alpha")
-            value_alpha = network_alphas.get(name + ".v.proj.alpha")
-            proj_alpha = network_alphas.get(name + ".out.proj.alpha")
+            query_alpha = network_alphas.pop(name + ".to_q_lora.down.weight.alpha", None)
+            key_alpha = network_alphas.pop(name + ".to_k_lora.down.weight.alpha", None)
+            value_alpha = network_alphas.pop(name + ".to_v_lora.down.weight.alpha", None)
+            out_alpha = network_alphas.pop(name + ".to_out_lora.down.weight.alpha", None)
 
-            attn_module.q_proj = PatchedLoraProjection(
-                attn_module.q_proj, lora_scale, network_alpha=query_alpha, rank=rank, dtype=dtype
-            )
-            lora_parameters.extend(attn_module.q_proj.lora_linear_layer.parameters())
+            if isinstance(rank, dict):
+                current_rank = rank.pop(f"{name}.out_proj.lora_linear_layer.up.weight")
+            else:
+                current_rank = rank
 
-            attn_module.k_proj = PatchedLoraProjection(
-                attn_module.k_proj, lora_scale, network_alpha=key_alpha, rank=rank, dtype=dtype
+            attn_module.q_proj = create_patched_linear_lora(
+                attn_module.q_proj, query_alpha, current_rank, dtype, lora_parameters
             )
-            lora_parameters.extend(attn_module.k_proj.lora_linear_layer.parameters())
-
-            attn_module.v_proj = PatchedLoraProjection(
-                attn_module.v_proj, lora_scale, network_alpha=value_alpha, rank=rank, dtype=dtype
+            attn_module.k_proj = create_patched_linear_lora(
+                attn_module.k_proj, key_alpha, current_rank, dtype, lora_parameters
             )
-            lora_parameters.extend(attn_module.v_proj.lora_linear_layer.parameters())
-
-            attn_module.out_proj = PatchedLoraProjection(
-                attn_module.out_proj, lora_scale, network_alpha=proj_alpha, rank=rank, dtype=dtype
+            attn_module.v_proj = create_patched_linear_lora(
+                attn_module.v_proj, value_alpha, current_rank, dtype, lora_parameters
+            )
+            attn_module.out_proj = create_patched_linear_lora(
+                attn_module.out_proj, out_alpha, current_rank, dtype, lora_parameters
             )
-            lora_parameters.extend(attn_module.out_proj.lora_linear_layer.parameters())
 
         if patch_mlp:
             for name, mlp_module in text_encoder_mlp_modules(text_encoder):
-                fc1_alpha = network_alphas.get(name + ".linear1.alpha")
-                fc2_alpha = network_alphas.get(name + ".linear2.alpha")
+                fc1_alpha = network_alphas.pop(name + ".linear1.lora_linear_layer.down.weight.alpha", None)
+                fc2_alpha = network_alphas.pop(name + ".linear2.lora_linear_layer.down.weight.alpha", None)
 
-                mlp_module.linear1 = PatchedLoraProjection(
-                    mlp_module.linear1, lora_scale, network_alpha=fc1_alpha, rank=rank, dtype=dtype
-                )
-                lora_parameters.extend(mlp_module.linear1.lora_linear_layer.parameters())
+                current_rank_fc1 = rank.pop(f"{name}.linear1.lora_linear_layer.up.weight")
+                current_rank_fc2 = rank.pop(f"{name}.linear2.lora_linear_layer.up.weight")
 
-                mlp_module.linear2 = PatchedLoraProjection(
-                    mlp_module.linear2, lora_scale, network_alpha=fc2_alpha, rank=rank, dtype=dtype
+                mlp_module.linear1 = create_patched_linear_lora(
+                    mlp_module.linear1, fc1_alpha, current_rank_fc1, dtype, lora_parameters
+                )
+                mlp_module.linear2 = create_patched_linear_lora(
+                    mlp_module.linear2, fc2_alpha, current_rank_fc2, dtype, lora_parameters
                 )
-                lora_parameters.extend(mlp_module.linear2.lora_linear_layer.parameters())
+
+        if is_network_alphas_populated and len(network_alphas) > 0:
+            raise ValueError(
+                f"The `network_alphas` has to be empty at this point but has the following keys \n\n {', '.join(network_alphas.keys())}"
+            )
 
         return lora_parameters
 
@@ -1937,39 +2097,128 @@ def unload_lora_weights(self):
         >>> ...
         ```
         """
-        from .models.attention_processor import (
-            LORA_ATTENTION_PROCESSORS,
-            AttnProcessor,
-            AttnProcessor2_5,
-            LoRAAttnAddedKVProcessor,
-            LoRAAttnProcessor,
-            LoRAAttnProcessor2_5,
-            LoRAXFormersAttnProcessor,
-            XFormersAttnProcessor,
-        )
+        # TODO: remove
+        # from .models.attention_processor import (
+        #     LORA_ATTENTION_PROCESSORS,
+        #     AttnProcessor,
+        #     AttnProcessor2_5,
+        #     LoRAAttnAddedKVProcessor,
+        #     LoRAAttnProcessor,
+        #     LoRAAttnProcessor2_5,
+        #     LoRAXFormersAttnProcessor,
+        #     XFormersAttnProcessor,
+        # )
+
+        # unet_attention_classes = {type(processor) for _, processor in self.unet.attn_processors.items()}
+
+        # if unet_attention_classes.issubset(LORA_ATTENTION_PROCESSORS):
+        #     # Handle attention processors that are a mix of regular attention and AddedKV
+        #     # attention.
+        #     if len(unet_attention_classes) > 1 or LoRAAttnAddedKVProcessor in unet_attention_classes:
+        #         self.unet.set_default_attn_processor()
+        #     else:
+        #         regular_attention_classes = {
+        #             LoRAAttnProcessor: AttnProcessor,
+        #             LoRAAttnProcessor2_5: AttnProcessor2_5,
+        #             LoRAXFormersAttnProcessor: XFormersAttnProcessor,
+        #         }
+        #         [attention_proc_class] = unet_attention_classes
+        #         self.unet.set_attn_processor(regular_attention_classes[attention_proc_class]())
+
+        for _, module in self.unet.named_sublayers():
+            if hasattr(module, "set_lora_layer"):
+                module.set_lora_layer(None)
+
+        # Safe to call the following regardless of LoRA.
+        self._remove_text_encoder_monkey_patch()
 
-        unet_attention_classes = {type(processor) for _, processor in self.unet.attn_processors.items()}
+    def fuse_lora(self, fuse_unet: bool = True, fuse_text_encoder: bool = True, lora_scale: float = 1.0):
+        r"""
+        Fuses the LoRA parameters into the original parameters of the corresponding blocks.
 
-        if unet_attention_classes.issubset(LORA_ATTENTION_PROCESSORS):
-            # Handle attention processors that are a mix of regular attention and AddedKV
-            # attention.
-            if len(unet_attention_classes) > 1 or LoRAAttnAddedKVProcessor in unet_attention_classes:
-                self.unet.set_default_attn_processor()
-            else:
-                regular_attention_classes = {
-                    LoRAAttnProcessor: AttnProcessor,
-                    LoRAAttnProcessor2_5: AttnProcessor2_5,
-                    LoRAXFormersAttnProcessor: XFormersAttnProcessor,
-                }
-                [attention_proc_class] = unet_attention_classes
-                self.unet.set_attn_processor(regular_attention_classes[attention_proc_class]())
+        <Tip warning={true}>
 
-            for _, module in self.unet.named_sublayers():
-                if hasattr(module, "set_lora_layer"):
-                    module.set_lora_layer(None)
+        This is an experimental API.
 
-        # Safe to call the following regardless of LoRA.
-        self._remove_text_encoder_monkey_patch()
+        </Tip>
+
+        Args:
+            fuse_unet (`bool`, defaults to `True`): Whether to fuse the UNet LoRA parameters.
+            fuse_text_encoder (`bool`, defaults to `True`):
+                Whether to fuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
+                LoRA parameters then it won't have any effect.
+            lora_scale (`float`, defaults to 1.0):
+                Controls how much to influence the outputs with the LoRA parameters.
+        """
+        if fuse_unet or fuse_text_encoder:
+            self.num_fused_loras += 1
+            if self.num_fused_loras > 1:
+                logger.warn(
+                    "The current API is supported for operating with a single LoRA file. You are trying to load and fuse more than one LoRA which is not well-supported.",
+                )
+
+        if fuse_unet:
+            self.unet.fuse_lora(lora_scale)
+
+        def fuse_text_encoder_lora(text_encoder):
+            for _, attn_module in text_encoder_attn_modules(text_encoder):
+                if isinstance(attn_module.q_proj, PatchedLoraProjection):
+                    attn_module.q_proj._fuse_lora(lora_scale)
+                    attn_module.k_proj._fuse_lora(lora_scale)
+                    attn_module.v_proj._fuse_lora(lora_scale)
+                    attn_module.out_proj._fuse_lora(lora_scale)
+
+            for _, mlp_module in text_encoder_mlp_modules(text_encoder):
+                if isinstance(mlp_module.linear1, PatchedLoraProjection):
+                    mlp_module.linear1._fuse_lora(lora_scale)
+                    mlp_module.linear2._fuse_lora(lora_scale)
+
+        if fuse_text_encoder:
+            if hasattr(self, "text_encoder"):
+                fuse_text_encoder_lora(self.text_encoder)
+            if hasattr(self, "text_encoder_2"):
+                fuse_text_encoder_lora(self.text_encoder_2)
+
+    def unfuse_lora(self, unfuse_unet: bool = True, unfuse_text_encoder: bool = True):
+        r"""
+        Reverses the effect of
+        [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.fuse_lora).
+
+        <Tip warning={true}>
+
+        This is an experimental API.
+
+        </Tip>
+
+        Args:
+            unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters.
+            unfuse_text_encoder (`bool`, defaults to `True`):
+                Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
+                LoRA parameters then it won't have any effect.
+        """
+        if unfuse_unet:
+            self.unet.unfuse_lora()
+
+        def unfuse_text_encoder_lora(text_encoder):
+            for _, attn_module in text_encoder_attn_modules(text_encoder):
+                if isinstance(attn_module.q_proj, PatchedLoraProjection):
+                    attn_module.q_proj._unfuse_lora()
+                    attn_module.k_proj._unfuse_lora()
+                    attn_module.v_proj._unfuse_lora()
+                    attn_module.out_proj._unfuse_lora()
+
+            for _, mlp_module in text_encoder_mlp_modules(text_encoder):
+                if isinstance(mlp_module.linear1, PatchedLoraProjection):
+                    mlp_module.linear1._unfuse_lora()
+                    mlp_module.linear2._unfuse_lora()
+
+        if unfuse_text_encoder:
+            if hasattr(self, "text_encoder"):
+                unfuse_text_encoder_lora(self.text_encoder)
+            if hasattr(self, "text_encoder_2"):
+                unfuse_text_encoder_lora(self.text_encoder_2)
+
+        self.num_fused_loras -= 1
 
 
 class FromSingleFileMixin:
diff --git a/ppdiffusers/ppdiffusers/models/activations.py b/ppdiffusers/ppdiffusers/models/activations.py
index 8091cb327..61d66cc4b 100644
--- a/ppdiffusers/ppdiffusers/models/activations.py
+++ b/ppdiffusers/ppdiffusers/models/activations.py
@@ -13,14 +13,94 @@
 # limitations under the License.
 
 import paddle.nn as nn
+import paddle.nn.functional as F
 
+from ..utils import USE_PEFT_BACKEND
+from .lora import LoRACompatibleLinear
 
-def get_activation(act_fn):
-    if act_fn in ["swish", "silu"]:
-        return nn.Silu()
-    elif act_fn == "mish":
-        return nn.Mish()
-    elif act_fn == "gelu":
-        return nn.GELU()
+ACTIVATION_FUNCTIONS = {
+    "swish": nn.Silu(),
+    "silu": nn.Silu(),
+    "mish": nn.Mish(),
+    "gelu": nn.GELU(),
+    "relu": nn.ReLU(),
+}
+
+
+def get_activation(act_fn: str) -> nn.Layer:
+    """Helper function to get activation function from string.
+
+    Args:
+        act_fn (str): Name of activation function.
+
+    Returns:
+        nn.Layer: Activation function.
+    """
+
+    act_fn = act_fn.lower()
+    if act_fn in ACTIVATION_FUNCTIONS:
+        return ACTIVATION_FUNCTIONS[act_fn]
     else:
         raise ValueError(f"Unsupported activation function: {act_fn}")
+
+
+class GELU(nn.Layer):
+    r"""
+    GELU activation function with tanh approximation support with `approximate="tanh"`.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+        approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
+        super().__init__()
+        self.proj = LoRACompatibleLinear(dim_in, dim_out)
+        self.approximate = approximate
+        self.approximate_bool = approximate == "tanh"
+
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states = F.gelu(hidden_states, approximate=self.approximate_bool)
+        return hidden_states
+
+
+class GEGLU(nn.Layer):
+    r"""
+    A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int):
+        super().__init__()
+        linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear
+
+        self.proj = linear_cls(dim_in, dim_out * 2)
+
+    def forward(self, hidden_states, scale: float = 1.0):
+        args = () if USE_PEFT_BACKEND else (scale,)
+        hidden_states, gate = self.proj(hidden_states, *args).chunk(2, axis=-1)
+        return hidden_states * F.gelu(gate)
+
+
+class ApproximateGELU(nn.Layer):
+    r"""
+    The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this
+    [paper](https://arxiv.org/abs/1606.08415).
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out)
+
+    def forward(self, x):
+        x = self.proj(x)
+        return x * F.sigmoid(1.702 * x)
diff --git a/ppdiffusers/ppdiffusers/models/attention.py b/ppdiffusers/ppdiffusers/models/attention.py
index bc6f8cdcf..3343636d3 100644
--- a/ppdiffusers/ppdiffusers/models/attention.py
+++ b/ppdiffusers/ppdiffusers/models/attention.py
@@ -19,11 +19,12 @@
 import paddle.nn.functional as F
 from paddle import nn
 
-from ..utils import is_ppxformers_available
-from .activations import get_activation
+from ..utils import is_ppxformers_available, maybe_allow_in_graph
+from .activations import GEGLU, GELU, ApproximateGELU
 from .attention_processor import Attention
-from .embeddings import CombinedTimestepLabelEmbeddings
+from .embeddings import SinusoidalPositionalEmbedding
 from .lora import LoRACompatibleLinear
+from .normalization import AdaLayerNorm, AdaLayerNormZero
 
 
 def drop_path(input, drop_prob: float = 0.0, training: bool = False):
@@ -228,6 +229,56 @@ def forward(self, hidden_states):
         return hidden_states
 
 
+@maybe_allow_in_graph
+class GatedSelfAttentionDense(nn.Layer):
+    r"""
+    A gated self-attention dense layer that combines visual features and object features.
+
+    Parameters:
+        query_dim (`int`): The number of channels in the query.
+        context_dim (`int`): The number of channels in the context.
+        n_heads (`int`): The number of heads to use for attention.
+        d_head (`int`): The number of channels in each head.
+    """
+
+    def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
+        super().__init__()
+
+        # we need a linear projection since we need cat visual feature and obj feature
+        self.linear = nn.Linear(context_dim, query_dim)
+
+        self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
+        self.ff = FeedForward(query_dim, activation_fn="geglu")
+
+        self.norm1 = nn.LayerNorm(query_dim)
+        self.norm2 = nn.LayerNorm(query_dim)
+
+        tmp1 = paddle.to_tensor(0.0)
+        tmp2 = paddle.to_tensor(0.0)
+        self.register_parameter(
+            "alpha_attn",
+            self.create_parameter(tmp1.shape, tmp1.dtype, default_initializer=paddle.nn.initializer.Assign(tmp1)),
+        )
+        self.register_parameter(
+            "alpha_dense",
+            self.create_parameter(tmp2.shape, tmp2.dtype, default_initializer=paddle.nn.initializer.Assign(tmp2)),
+        )
+
+        self.enabled = True
+
+    def forward(self, x, objs):
+        if not self.enabled:
+            return x
+
+        n_visual = x.shape[1]
+        objs = self.linear(objs)
+
+        x = x + self.alpha_attn.tanh() * self.attn(self.norm1(paddle.concat([x, objs], axis=1)))[:, :n_visual, :]
+        x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
+
+        return x
+
+
 class BasicTransformerBlock(nn.Layer):
     r"""
     A basic Transformer block.
@@ -238,15 +289,29 @@ class BasicTransformerBlock(nn.Layer):
         attention_head_dim (`int`): The number of channels in each head.
         dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
         cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
-        only_cross_attention (`bool`, *optional*):
-            Whether to use only cross-attention layers. In this case two cross attention layers are used.
-        double_self_attention (`bool`, *optional*):
-            Whether to use two self-attention layers. In this case no cross attention layers are used.
         activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
         num_embeds_ada_norm (:
             obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
         attention_bias (:
             obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+        final_dropout (`bool` *optional*, defaults to False):
+            Whether to apply a final dropout after the last feed-forward layer.
+        attention_type (`str`, *optional*, defaults to `"default"`):
+            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
+        positional_embeddings (`str`, *optional*, defaults to `None`):
+            The type of positional embeddings to apply to.
+        num_positional_embeddings (`int`, *optional*, defaults to `None`):
+            The maximum number of positional embeddings to apply.
     """
 
     def __init__(
@@ -263,14 +328,20 @@ def __init__(
         double_self_attention: bool = False,
         upcast_attention: bool = False,
         norm_elementwise_affine: bool = True,
-        norm_type: str = "layer_norm",
+        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
+        norm_eps: float = 1e-5,
         final_dropout: bool = False,
+        attention_type: str = "default",
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
     ):
         super().__init__()
         self.only_cross_attention = only_cross_attention
 
         self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
         self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
+        self.use_layer_norm = norm_type == "layer_norm"
 
         if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
             raise ValueError(
@@ -283,6 +354,16 @@ def __init__(
         else:
             norm_kwargs = {}
 
+        if positional_embeddings and (num_positional_embeddings is None):
+            raise ValueError(
+                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
+            )
+
+        if positional_embeddings == "sinusoidal":
+            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
+        else:
+            self.pos_embed = None
+
         # Define 3 blocks. Each block has its own normalization layer.
         # 1. Self-Attn
         if self.use_ada_layer_norm:
@@ -323,9 +404,22 @@ def __init__(
             self.attn2 = None
 
         # 3. Feed-forward
-        self.norm3 = nn.LayerNorm(dim, **norm_kwargs)
+        if not self.use_ada_layer_norm_single:
+            self.norm3 = nn.LayerNorm(dim, **norm_kwargs)
+
         self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
 
+        # 4. Fuser
+        if attention_type == "gated" or attention_type == "gated-text-image":
+            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+
+        # 5. Scale-shift for PixArt-Alpha.
+        if self.use_ada_layer_norm_single:
+            tmp_param = paddle.randn([6, dim]) / dim**0.5
+            self.scale_shift_table = self.create_parameter(
+                shape=tmp_param.shape, default_initializer=paddle.nn.initializer.Assign(tmp_param)
+            )
+
         # let chunk size default to None
         self._chunk_size = None
         self._chunk_dim = 0
@@ -346,17 +440,37 @@ def forward(
         class_labels: Optional[paddle.Tensor] = None,
     ):
         # Notice that normalization is always applied before the real computation in the following blocks.
-        # 1. Self-Attention
+        # 0. Self-Attention
+        batch_size = hidden_states.shape[0]
+
         if self.use_ada_layer_norm:
             norm_hidden_states = self.norm1(hidden_states, timestep)
         elif self.use_ada_layer_norm_zero:
             norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
                 hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
             )
-        else:
+        elif self.use_layer_norm:
+            norm_hidden_states = self.norm1(hidden_states)
+        elif self.use_ada_layer_norm_single:
+            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
+                self.scale_shift_table[None] + timestep.reshape([batch_size, 6, -1])
+            ).chunk(6, dim=1)
             norm_hidden_states = self.norm1(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
+            norm_hidden_states = norm_hidden_states.squeeze(1)
+        else:
+            raise ValueError("Incorrect norm used")
+
+        if self.pos_embed is not None:
+            norm_hidden_states = self.pos_embed(norm_hidden_states)
+
+        # 1. Retrieve lora scale.
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        # 2. Prepare GLIGEN inputs
+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
 
-        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
         attn_output = self.attn1(
             norm_hidden_states,
             encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
@@ -365,14 +479,33 @@ def forward(
         )
         if self.use_ada_layer_norm_zero:
             attn_output = gate_msa.unsqueeze(1) * attn_output
+        elif self.use_ada_layer_norm_single:
+            attn_output = gate_msa * attn_output
+
         hidden_states = attn_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
 
+        # 2.5 GLIGEN Control
+        if gligen_kwargs is not None:
+            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
+
+        # 3. Cross-Attention
         if self.attn2 is not None:
-            norm_hidden_states = (
-                self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
-            )
+            if self.use_ada_layer_norm:
+                norm_hidden_states = self.norm2(hidden_states, timestep)
+            elif self.use_ada_layer_norm_zero or self.use_layer_norm:
+                norm_hidden_states = self.norm2(hidden_states)
+            elif self.use_ada_layer_norm_single:
+                # For PixArt norm2 isn't applied here:
+                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
+                norm_hidden_states = hidden_states
+            else:
+                raise ValueError("Incorrect norm")
+
+            if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
+                norm_hidden_states = self.pos_embed(norm_hidden_states)
 
-            # 2. Cross-Attention
             attn_output = self.attn2(
                 norm_hidden_states,
                 encoder_hidden_states=encoder_hidden_states,
@@ -381,12 +514,17 @@ def forward(
             )
             hidden_states = attn_output + hidden_states
 
-        # 3. Feed-forward
-        norm_hidden_states = self.norm3(hidden_states)
+        # 4. Feed-forward
+        if not self.use_ada_layer_norm_single:
+            norm_hidden_states = self.norm3(hidden_states)
 
         if self.use_ada_layer_norm_zero:
             norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
 
+        if self.use_ada_layer_norm_single:
+            norm_hidden_states = self.norm2(hidden_states)
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
+
         if self._chunk_size is not None:
             # "feed_forward_chunk_size" can be used to save memory
             if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
@@ -396,16 +534,23 @@ def forward(
 
             num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
             ff_output = paddle.concat(
-                [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, axis=self._chunk_dim)],
+                [
+                    self.ff(hid_slice, scale=lora_scale)
+                    for hid_slice in norm_hidden_states.chunk(num_chunks, axis=self._chunk_dim)
+                ],
                 axis=self._chunk_dim,
             )
         else:
-            ff_output = self.ff(norm_hidden_states)
+            ff_output = self.ff(norm_hidden_states, scale=lora_scale)
 
         if self.use_ada_layer_norm_zero:
             ff_output = gate_mlp.unsqueeze(1) * ff_output
+        elif self.use_ada_layer_norm_single:
+            ff_output = gate_mlp * ff_output
 
         hidden_states = ff_output + hidden_states
+        if hidden_states.ndim == 4:
+            hidden_states = hidden_states.squeeze(1)
 
         return hidden_states
 
@@ -456,137 +601,10 @@ def __init__(
         if final_dropout:
             self.net.append(nn.Dropout(dropout))
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states, scale: float = 1.0):
         for module in self.net:
-            hidden_states = module(hidden_states)
-        return hidden_states
-
-
-class GELU(nn.Layer):
-    r"""
-    GELU activation function with tanh approximation support with `approximate="tanh"`.
-    """
-
-    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
-        super().__init__()
-        self.proj = LoRACompatibleLinear(dim_in, dim_out)
-        self.approximate = approximate
-        self.approximate_bool = approximate == "tanh"
-
-    def forward(self, hidden_states):
-        hidden_states = self.proj(hidden_states)
-        hidden_states = F.gelu(hidden_states, approximate=self.approximate_bool)
+            if isinstance(module, (LoRACompatibleLinear, GEGLU)):
+                hidden_states = module(hidden_states, scale)
+            else:
+                hidden_states = module(hidden_states)
         return hidden_states
-
-
-class GEGLU(nn.Layer):
-    r"""
-    A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
-
-    Parameters:
-        dim_in (`int`): The number of channels in the input.
-        dim_out (`int`): The number of channels in the output.
-    """
-
-    def __init__(self, dim_in: int, dim_out: int):
-        super().__init__()
-        self.proj = LoRACompatibleLinear(dim_in, dim_out * 2)
-
-    def forward(self, hidden_states):
-        hidden_states, gate = self.proj(hidden_states).chunk(2, axis=-1)
-        return hidden_states * F.gelu(gate)
-
-
-class ApproximateGELU(nn.Layer):
-    """
-    The approximate form of Gaussian Error Linear Unit (GELU)
-
-    For more details, see section 2: https://arxiv.org/abs/1606.08415
-    """
-
-    def __init__(self, dim_in: int, dim_out: int):
-        super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out)
-
-    def forward(self, x):
-        x = self.proj(x)
-        return x * F.sigmoid(1.702 * x)
-
-
-class AdaLayerNorm(nn.Layer):
-    """
-    Norm layer modified to incorporate timestep embeddings.
-    """
-
-    def __init__(self, embedding_dim, num_embeddings):
-        super().__init__()
-        self.emb = nn.Embedding(num_embeddings, embedding_dim)
-        self.silu = nn.Silu()
-        self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
-        # elementwise_affine=False
-        norm_kwargs = {"weight_attr": False, "bias_attr": False}
-        self.norm = nn.LayerNorm(embedding_dim, **norm_kwargs)
-
-    def forward(self, x, timestep):
-        emb = self.linear(self.silu(self.emb(timestep)))
-        # must set axis=-1, paddle vs pytorch
-        scale, shift = paddle.chunk(emb, 2, axis=-1)
-        x = self.norm(x) * (1 + scale) + shift
-        return x
-
-
-class AdaLayerNormZero(nn.Layer):
-    """
-    Norm layer adaptive layer norm zero (adaLN-Zero).
-    """
-
-    def __init__(self, embedding_dim, num_embeddings):
-        super().__init__()
-
-        self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim)
-
-        self.silu = nn.Silu()
-        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias_attr=True)
-        # elementwise_affine=False
-        norm_kwargs = {"weight_attr": False, "bias_attr": False}
-        self.norm = nn.LayerNorm(embedding_dim, epsilon=1e-6, **norm_kwargs)
-
-    def forward(self, x, timestep, class_labels, hidden_dtype=None):
-        emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, axis=1)
-        x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
-        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
-
-
-class AdaGroupNorm(nn.Layer):
-    """
-    GroupNorm layer modified to incorporate timestep embeddings.
-    """
-
-    def __init__(
-        self, embedding_dim: int, out_dim: int, num_groups: int, act_fn: Optional[str] = None, eps: float = 1e-5
-    ):
-        super().__init__()
-        self.num_groups = num_groups
-        self.eps = eps
-        if act_fn is None:
-            self.act = None
-        else:
-            self.act = get_activation(act_fn)
-
-        self.linear = nn.Linear(embedding_dim, out_dim * 2)
-        # elementwise_affine=False
-        norm_kwargs = {"weight_attr": False, "bias_attr": False}
-        self.group_norm = nn.GroupNorm(num_groups, out_dim, epsilon=eps, **norm_kwargs)
-        self.group_norm.weight = None
-        self.group_norm.bias = None
-
-    def forward(self, x, emb):
-        if self.act:
-            emb = self.act(emb)
-        emb = self.linear(emb)
-        emb = emb[:, :, None, None]
-        scale, shift = emb.chunk(2, axis=1)
-        x = self.group_norm(x)
-        x = x * (1 + scale) + shift
-        return x
diff --git a/ppdiffusers/ppdiffusers/models/attention_processor.py b/ppdiffusers/ppdiffusers/models/attention_processor.py
index 0ed4e1e5d..e7789de33 100644
--- a/ppdiffusers/ppdiffusers/models/attention_processor.py
+++ b/ppdiffusers/ppdiffusers/models/attention_processor.py
@@ -12,14 +12,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from importlib import import_module
 from typing import Optional, Union
 
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
 
-from ..utils import deprecate, is_ppxformers_available, logging
-from .lora import LoRALinearLayer
+from ..utils import USE_PEFT_BACKEND, deprecate, is_ppxformers_available, logging
+from .lora import LoRACompatibleLinear, LoRALinearLayer
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
@@ -29,14 +30,50 @@ class Attention(nn.Layer):
     A cross attention layer.
 
     Parameters:
-        query_dim (`int`): The number of channels in the query.
+        query_dim (`int`):
+            The number of channels in the query.
         cross_attention_dim (`int`, *optional*):
             The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
-        heads (`int`,  *optional*, defaults to 8): The number of heads to use for multi-head attention.
-        dim_head (`int`,  *optional*, defaults to 64): The number of channels in each head.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        heads (`int`,  *optional*, defaults to 8):
+            The number of heads to use for multi-head attention.
+        dim_head (`int`,  *optional*, defaults to 64):
+            The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability to use.
         bias (`bool`, *optional*, defaults to False):
             Set to `True` for the query, key, and value linear layers to contain a bias parameter.
+        upcast_attention (`bool`, *optional*, defaults to False):
+            Set to `True` to upcast the attention computation to `float32`.
+        upcast_softmax (`bool`, *optional*, defaults to False):
+            Set to `True` to upcast the softmax computation to `float32`.
+        cross_attention_norm (`str`, *optional*, defaults to `None`):
+            The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
+        cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use for the group norm in the cross attention.
+        added_kv_proj_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the added key and value projections. If `None`, no projection is used.
+        norm_num_groups (`int`, *optional*, defaults to `None`):
+            The number of groups to use for the group norm in the attention.
+        spatial_norm_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the spatial normalization.
+        out_bias (`bool`, *optional*, defaults to `True`):
+            Set to `True` to use a bias in the output linear layer.
+        scale_qk (`bool`, *optional*, defaults to `True`):
+            Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
+        only_cross_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
+            `added_kv_proj_dim` is not `None`.
+        eps (`float`, *optional*, defaults to 1e-5):
+            An additional value added to the denominator in group normalization that is used for numerical stability.
+        rescale_output_factor (`float`, *optional*, defaults to 1.0):
+            A factor to rescale the output by dividing it with this value.
+        residual_connection (`bool`, *optional*, defaults to `False`):
+            Set to `True` to add the residual connection to the output.
+        _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
+            Set to `True` if the attention block is loaded from a deprecated state dict.
+        processor (`AttnProcessor`, *optional*, defaults to `None`):
+            The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and
+            `AttnProcessor` otherwise.
     """
 
     def __init__(
@@ -46,7 +83,7 @@ def __init__(
         heads: int = 8,
         dim_head: int = 64,
         dropout: float = 0.0,
-        bias=False,
+        bias: bool = False,
         upcast_attention: bool = False,
         upcast_softmax: bool = False,
         cross_attention_norm: Optional[str] = None,
@@ -60,12 +97,12 @@ def __init__(
         eps: float = 1e-5,
         rescale_output_factor: float = 1.0,
         residual_connection: bool = False,
-        _from_deprecated_attn_block=False,
+        _from_deprecated_attn_block: bool = False,
         processor: Optional["AttnProcessor"] = None,
     ):
         super().__init__()
-        inner_dim = dim_head * heads
-        cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
+        self.inner_dim = dim_head * heads
+        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
         self.upcast_attention = upcast_attention
         self.upcast_softmax = upcast_softmax
         self.rescale_output_factor = rescale_output_factor
@@ -107,7 +144,7 @@ def __init__(
         if cross_attention_norm is None:
             self.norm_cross = None
         elif cross_attention_norm == "layer_norm":
-            self.norm_cross = nn.LayerNorm(cross_attention_dim)
+            self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
         elif cross_attention_norm == "group_norm":
             if self.added_kv_proj_dim is not None:
                 # The given `encoder_hidden_states` are initially of shape
@@ -117,7 +154,7 @@ def __init__(
                 # the number of channels for the group norm.
                 norm_cross_num_channels = added_kv_proj_dim
             else:
-                norm_cross_num_channels = cross_attention_dim
+                norm_cross_num_channels = self.cross_attention_dim
 
             self.norm_cross = nn.GroupNorm(
                 num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, epsilon=1e-5
@@ -127,22 +164,22 @@ def __init__(
                 f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
             )
 
-        self.to_q = nn.Linear(query_dim, inner_dim, bias_attr=bias)
+        self.to_q = LoRACompatibleLinear(query_dim, self.inner_dim, bias_attr=bias)
 
         if not self.only_cross_attention:
             # only relevant for the `AddedKVProcessor` classes
-            self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias)
-            self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias)
+            self.to_k = LoRACompatibleLinear(self.cross_attention_dim, self.inner_dim, bias_attr=bias)
+            self.to_v = LoRACompatibleLinear(self.cross_attention_dim, self.inner_dim, bias_attr=bias)
         else:
             self.to_k = None
             self.to_v = None
 
         if self.added_kv_proj_dim is not None:
-            self.add_k_proj = nn.Linear(added_kv_proj_dim, inner_dim)
-            self.add_v_proj = nn.Linear(added_kv_proj_dim, inner_dim)
+            self.add_k_proj = LoRACompatibleLinear(added_kv_proj_dim, self.inner_dim)
+            self.add_v_proj = LoRACompatibleLinear(added_kv_proj_dim, self.inner_dim)
 
         self.to_out = nn.LayerList([])
-        self.to_out.append(nn.Linear(inner_dim, query_dim, bias_attr=out_bias))
+        self.to_out.append(LoRACompatibleLinear(self.inner_dim, query_dim, bias_attr=out_bias))
         self.to_out.append(nn.Dropout(dropout))
 
         # set attention processor
@@ -152,7 +189,17 @@ def __init__(
 
     def set_use_memory_efficient_attention_xformers(
         self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[str] = None
-    ):
+    ) -> None:
+        r"""
+        Set whether to use memory efficient attention from `xformers` or not.
+
+        Args:
+            use_memory_efficient_attention_xformers (`bool`):
+                Whether to use memory efficient attention from `xformers` or not.
+            attention_op (`Callable`, *optional*):
+                The attention operation to use. Defaults to `None` which uses the default attention operation from
+                `xformers`.
+        """
         is_lora = hasattr(self, "processor") and isinstance(
             self.processor,
             LORA_ATTENTION_PROCESSORS,
@@ -170,6 +217,7 @@ def set_use_memory_efficient_attention_xformers(
                 LoRAAttnAddedKVProcessor,
             ),
         )
+
         if use_memory_efficient_attention_xformers:
             if is_added_kv_processor and (is_lora or is_custom_diffusion):
                 raise NotImplementedError(
@@ -252,6 +300,13 @@ def set_use_memory_efficient_attention_xformers(
         self.set_processor(processor)
 
     def set_attention_slice(self, slice_size):
+        r"""
+        Set the slice size for attention computation.
+
+        Args:
+            slice_size (`int`):
+                The slice size for attention computation.
+        """
         if slice_size is not None and slice_size > self.sliceable_head_dim:
             raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
 
@@ -266,7 +321,32 @@ def set_attention_slice(self, slice_size):
 
         self.set_processor(processor)
 
-    def set_processor(self, processor: "AttnProcessor"):
+    def set_processor(self, processor: "AttnProcessor", _remove_lora: bool = False) -> None:
+        r"""
+        Set the attention processor to use.
+
+        Args:
+            processor (`AttnProcessor`):
+                The attention processor to use.
+            _remove_lora (`bool`, *optional*, defaults to `False`):
+                Set to `True` to remove LoRA layers from the model.
+        """
+        if (
+            hasattr(self, "processor")
+            and not isinstance(processor, LORA_ATTENTION_PROCESSORS)
+            and self.to_q.lora_layer is not None
+        ):
+            deprecate(
+                "set_processor to offload LoRA",
+                "0.26.0",
+                "In detail, removing LoRA layers via calling `set_default_attn_processor` is deprecated. Please make sure to call `pipe.unload_lora_weights()` instead.",
+            )
+            # TODO(Patrick, Sayak) - this can be deprecated once PEFT LoRA integration is complete
+            # We need to remove all LoRA layers
+            for module in self.sublayers():
+                if hasattr(module, "set_lora_layer"):
+                    module.set_lora_layer(None)
+
         # if current processor is in `self._sub_layers` and if passed `processor` is not, we need to
         # pop `processor` from `self._sub_layers`
         if hasattr(self, "processor") and isinstance(self.processor, nn.Layer) and not isinstance(processor, nn.Layer):
@@ -275,7 +355,113 @@ def set_processor(self, processor: "AttnProcessor"):
 
         self.processor = processor
 
+    def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
+        r"""
+        Get the attention processor in use.
+
+        Args:
+            return_deprecated_lora (`bool`, *optional*, defaults to `False`):
+                Set to `True` to return the deprecated LoRA attention processor.
+
+        Returns:
+            "AttentionProcessor": The attention processor in use.
+        """
+        if not return_deprecated_lora:
+            return self.processor
+
+        # TODO(Sayak, Patrick). The rest of the function is needed to ensure backwards compatible
+        # serialization format for LoRA Attention Processors. It should be deleted once the integration
+        # with PEFT is completed.
+        is_lora_activated = {
+            name: module.lora_layer is not None
+            for name, module in self.named_sublayers()
+            if hasattr(module, "lora_layer")
+        }
+
+        # 1. if no layer has a LoRA activated we can return the processor as usual
+        if not any(is_lora_activated.values()):
+            return self.processor
+
+        # If doesn't apply LoRA do `add_k_proj` or `add_v_proj`
+        is_lora_activated.pop("add_k_proj", None)
+        is_lora_activated.pop("add_v_proj", None)
+        # 2. else it is not posssible that only some layers have LoRA activated
+        if not all(is_lora_activated.values()):
+            raise ValueError(
+                f"Make sure that either all layers or no layers have LoRA activated, but have {is_lora_activated}"
+            )
+
+        # 3. And we need to merge the current LoRA layers into the corresponding LoRA attention processor
+        non_lora_processor_cls_name = self.processor.__class__.__name__
+        lora_processor_cls = getattr(import_module(__name__), "LoRA" + non_lora_processor_cls_name)
+
+        hidden_size = self.inner_dim
+
+        # now create a LoRA attention processor from the LoRA layers
+        if lora_processor_cls in [LoRAAttnProcessor, LoRAAttnProcessor2_5, LoRAXFormersAttnProcessor]:
+            kwargs = {
+                "cross_attention_dim": self.cross_attention_dim,
+                "rank": self.to_q.lora_layer.rank,
+                "network_alpha": self.to_q.lora_layer.network_alpha,
+                "q_rank": self.to_q.lora_layer.rank,
+                "q_hidden_size": self.to_q.lora_layer.out_features,
+                "k_rank": self.to_k.lora_layer.rank,
+                "k_hidden_size": self.to_k.lora_layer.out_features,
+                "v_rank": self.to_v.lora_layer.rank,
+                "v_hidden_size": self.to_v.lora_layer.out_features,
+                "out_rank": self.to_out[0].lora_layer.rank,
+                "out_hidden_size": self.to_out[0].lora_layer.out_features,
+            }
+
+            if hasattr(self.processor, "attention_op"):
+                kwargs["attention_op"] = self.prcoessor.attention_op
+
+            lora_processor = lora_processor_cls(hidden_size, **kwargs)
+            lora_processor.to_q_lora.load_dict(self.to_q.lora_layer.state_dict())
+            lora_processor.to_k_lora.load_dict(self.to_k.lora_layer.state_dict())
+            lora_processor.to_v_lora.load_dict(self.to_v.lora_layer.state_dict())
+            lora_processor.to_out_lora.load_dict(self.to_out[0].lora_layer.state_dict())
+        elif lora_processor_cls == LoRAAttnAddedKVProcessor:
+            lora_processor = lora_processor_cls(
+                hidden_size,
+                cross_attention_dim=self.add_k_proj.weight.shape[0],
+                rank=self.to_q.lora_layer.rank,
+                network_alpha=self.to_q.lora_layer.network_alpha,
+            )
+            lora_processor.to_q_lora.load_dict(self.to_q.lora_layer.state_dict())
+            lora_processor.to_k_lora.load_dict(self.to_k.lora_layer.state_dict())
+            lora_processor.to_v_lora.load_dict(self.to_v.lora_layer.state_dict())
+            lora_processor.to_out_lora.load_dict(self.to_out[0].lora_layer.state_dict())
+
+            # only save if used
+            if self.add_k_proj.lora_layer is not None:
+                lora_processor.add_k_proj_lora.load_dict(self.add_k_proj.lora_layer.state_dict())
+                lora_processor.add_v_proj_lora.load_dict(self.add_v_proj.lora_layer.state_dict())
+            else:
+                lora_processor.add_k_proj_lora = None
+                lora_processor.add_v_proj_lora = None
+        else:
+            raise ValueError(f"{lora_processor_cls} does not exist.")
+
+        return lora_processor
+
     def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
+        r"""
+        The forward method of the `Attention` class.
+
+        Args:
+            hidden_states (`paddle.Tensor`):
+                The hidden states of the query.
+            encoder_hidden_states (`paddle.Tensor`, *optional*):
+                The hidden states of the encoder.
+            attention_mask (`paddle.Tensor`, *optional*):
+                The attention mask to use. If `None`, no mask is applied.
+            **cross_attention_kwargs:
+                Additional keyword arguments to pass along to the cross attention.
+
+        Returns:
+            `paddle.Tensor`: The output of the attention layer.
+        """
         # The `Attention` class can call different attention processors / attention functions
         # here we simply pass along all tensors to the selected processor class
         # For standard processors that are defined here, `**cross_attention_kwargs` is empty
@@ -332,7 +518,7 @@ def prepare_attention_mask(self, attention_mask, target_length, batch_size=None,
         if batch_size is None:
             deprecate(
                 "batch_size=None",
-                "0.0.15",
+                "0.22.0",
                 message=(
                     "Not passing the `batch_size` parameter to `prepare_attention_mask` can lead to incorrect"
                     " attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to"
@@ -394,116 +580,12 @@ def __call__(
         encoder_hidden_states=None,
         attention_mask=None,
         temb=None,
-        **cross_attention_kwargs
-    ):
-        if attn.residual_connection:
-            residual = hidden_states
-
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.reshape([batch_size, channel, height * width]).transpose([0, 2, 1])
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
-
-        query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-
-        query = attn.head_to_batch_dim(query)
-        key = attn.head_to_batch_dim(key)
-        value = attn.head_to_batch_dim(value)
-
-        attention_probs = attn.get_attention_scores(query, key, attention_mask)
-        hidden_states = paddle.matmul(attention_probs, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch_size, channel, height, width])
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-
-class LoRAAttnProcessor(nn.Layer):
-    r"""
-    Processor for implementing the LoRA attention mechanism.
-
-    Args:
-        hidden_size (`int`, *optional*):
-            The hidden size of the attention layer.
-        cross_attention_dim (`int`, *optional*):
-            The number of channels in the `encoder_hidden_states`.
-        rank (`int`, defaults to 4):
-            The dimension of the LoRA update matrices.
-        network_alpha (`int`, *optional*):
-            Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs.
-    """
-
-    def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None, **kwargs):
-        super().__init__()
-
-        self.hidden_size = hidden_size
-        self.cross_attention_dim = cross_attention_dim
-        self.rank = rank
-
-        q_rank = kwargs.pop("q_rank", None)
-        q_hidden_size = kwargs.pop("q_hidden_size", None)
-        q_rank = q_rank if q_rank is not None else rank
-        q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size
-
-        v_rank = kwargs.pop("v_rank", None)
-        v_hidden_size = kwargs.pop("v_hidden_size", None)
-        v_rank = v_rank if v_rank is not None else rank
-        v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size
-
-        out_rank = kwargs.pop("out_rank", None)
-        out_hidden_size = kwargs.pop("out_hidden_size", None)
-        out_rank = out_rank if out_rank is not None else rank
-        out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size
-
-        self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha)
-        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
-        self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha)
-        self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha)
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states,
-        encoder_hidden_states=None,
-        attention_mask=None,
         scale=1.0,
-        temb=None,
         **cross_attention_kwargs
     ):
-        if attn.residual_connection:
-            residual = hidden_states
+        residual = hidden_states
+
+        args = () if USE_PEFT_BACKEND else (scale,)
 
         if attn.spatial_norm is not None:
             hidden_states = attn.spatial_norm(hidden_states, temb)
@@ -522,17 +604,17 @@ def __call__(
         if attn.group_norm is not None:
             hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
 
-        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
-        query = attn.head_to_batch_dim(query)
+        query = attn.to_q(hidden_states, *args)
 
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
         elif attn.norm_cross:
             encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
 
-        key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states, *args)
+        value = attn.to_v(encoder_hidden_states, *args)
 
+        query = attn.head_to_batch_dim(query)
         key = attn.head_to_batch_dim(key)
         value = attn.head_to_batch_dim(value)
 
@@ -541,7 +623,7 @@ def __call__(
         hidden_states = attn.batch_to_head_dim(hidden_states)
 
         # linear proj
-        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
+        hidden_states = attn.to_out[0](hidden_states, *args)
         # dropout
         hidden_states = attn.to_out[1](hidden_states)
 
@@ -577,12 +659,12 @@ class CustomDiffusionAttnProcessor(nn.Layer):
 
     def __init__(
         self,
-        train_kv=True,
-        train_q_out=True,
-        hidden_size=None,
-        cross_attention_dim=None,
-        out_bias=True,
-        dropout=0.0,
+        train_kv: bool = True,
+        train_q_out: bool = True,
+        hidden_size: Optional[int] = None,
+        cross_attention_dim: Optional[int] = None,
+        out_bias: bool = True,
+        dropout: float = 0.0,
     ):
         super().__init__()
         self.train_kv = train_kv
@@ -664,87 +746,6 @@ class AttnAddedKVProcessor:
     encoder.
     """
 
-    def __call__(
-        self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs
-    ):
-        residual = hidden_states
-        hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
-            [0, 2, 1]
-        )
-        batch_size, sequence_length, _ = hidden_states.shape
-
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
-
-        query = attn.to_q(hidden_states)
-        query = attn.head_to_batch_dim(query)
-
-        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
-        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
-        encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
-        encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
-
-        if not attn.only_cross_attention:
-            key = attn.to_k(hidden_states)
-            value = attn.to_v(hidden_states)
-            key = attn.head_to_batch_dim(key)
-            value = attn.head_to_batch_dim(value)
-            key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2)
-            value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2)
-        else:
-            key = encoder_hidden_states_key_proj
-            value = encoder_hidden_states_value_proj
-
-        attention_probs = attn.get_attention_scores(query, key, attention_mask)
-        hidden_states = paddle.matmul(attention_probs, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape)
-        hidden_states = hidden_states + residual
-
-        return hidden_states
-
-
-class LoRAAttnAddedKVProcessor(nn.Layer):
-    r"""
-    Processor for implementing the LoRA attention mechanism with extra learnable key and value matrices for the text
-    encoder.
-
-    Args:
-        hidden_size (`int`, *optional*):
-            The hidden size of the attention layer.
-        cross_attention_dim (`int`, *optional*, defaults to `None`):
-            The number of channels in the `encoder_hidden_states`.
-        rank (`int`, defaults to 4):
-            The dimension of the LoRA update matrices.
-
-    """
-
-    def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None):
-        super().__init__()
-
-        self.hidden_size = hidden_size
-        self.cross_attention_dim = cross_attention_dim
-        self.rank = rank
-
-        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
-        self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
-        self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
-        self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
-        self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
-        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
-
     def __call__(
         self,
         attn: Attention,
@@ -755,6 +756,9 @@ def __call__(
         **cross_attention_kwargs
     ):
         residual = hidden_states
+
+        args = () if USE_PEFT_BACKEND else (scale,)
+
         hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
             [0, 2, 1]
         )
@@ -769,25 +773,21 @@ def __call__(
 
         hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
 
-        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
+        query = attn.to_q(hidden_states, *args)
         query = attn.head_to_batch_dim(query)
 
-        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + scale * self.add_k_proj_lora(
-            encoder_hidden_states
-        )
-        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + scale * self.add_v_proj_lora(
-            encoder_hidden_states
-        )
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states, scale=scale)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states, scale=scale)
         encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
         encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
 
         if not attn.only_cross_attention:
-            key = attn.to_k(hidden_states) + scale * self.to_k_lora(hidden_states)
-            value = attn.to_v(hidden_states) + scale * self.to_v_lora(hidden_states)
+            key = attn.to_k(hidden_states, *args)
+            value = attn.to_v(hidden_states, *args)
             key = attn.head_to_batch_dim(key)
             value = attn.head_to_batch_dim(value)
-            key = paddle.concat([encoder_hidden_states_key_proj, key], axis=1)
-            value = paddle.concat([encoder_hidden_states_value_proj, value], axis=1)
+            key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2)
+            value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2)
         else:
             key = encoder_hidden_states_key_proj
             value = encoder_hidden_states_value_proj
@@ -797,7 +797,7 @@ def __call__(
         hidden_states = attn.batch_to_head_dim(hidden_states)
 
         # linear proj
-        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
+        hidden_states = attn.to_out[0](hidden_states, *args)
         # dropout
         hidden_states = attn.to_out[1](hidden_states)
 
@@ -906,10 +906,12 @@ def __call__(
         encoder_hidden_states: Optional[paddle.Tensor] = None,
         attention_mask: Optional[paddle.Tensor] = None,
         temb: Optional[paddle.Tensor] = None,
+        scale: float = 1.0,
         **cross_attention_kwargs
     ):
-        if attn.residual_connection:
-            residual = hidden_states
+        residual = hidden_states
+
+        args = () if USE_PEFT_BACKEND else (scale,)
 
         if attn.spatial_norm is not None:
             hidden_states = attn.spatial_norm(hidden_states, temb)
@@ -938,15 +940,15 @@ def __call__(
         if attn.group_norm is not None:
             hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
 
-        query = attn.to_q(hidden_states)
+        query = attn.to_q(hidden_states, *args)
 
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
         elif attn.norm_cross:
             encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
 
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states, *args)
+        value = attn.to_v(encoder_hidden_states, *args)
 
         # if transpose = False, query's shape will be [batch_size, seq_len, num_head, head_dim]
         query = attn.head_to_batch_dim(query, transpose=False)
@@ -967,7 +969,7 @@ def __call__(
         # hidden_states = hidden_states.cast(query.dtype)
         hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False)
         # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
+        hidden_states = attn.to_out[0](hidden_states, *args)
         # dropout
         hidden_states = attn.to_out[1](hidden_states)
 
@@ -982,137 +984,9 @@ def __call__(
         return hidden_states
 
 
-class LoRAXFormersAttnProcessor(nn.Layer):
+class CustomDiffusionXFormersAttnProcessor(nn.Layer):
     r"""
-    Processor for implementing the LoRA attention mechanism with memory efficient attention using xFormers.
-
-    Args:
-        hidden_size (`int`, *optional*):
-            The hidden size of the attention layer.
-        cross_attention_dim (`int`, *optional*):
-            The number of channels in the `encoder_hidden_states`.
-        rank (`int`, defaults to 4):
-            The dimension of the LoRA update matrices.
-        attention_op (`Callable`, *optional*, defaults to `None`):
-            The base
-            [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
-            use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
-            operator.
-        network_alpha (`int`, *optional*):
-            Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs.
-
-    """
-
-    def __init__(
-        self,
-        hidden_size,
-        cross_attention_dim,
-        rank=4,
-        attention_op: Optional[str] = None,
-        network_alpha=None,
-        **kwargs,
-    ):
-        super().__init__()
-
-        self.hidden_size = hidden_size
-        self.cross_attention_dim = cross_attention_dim
-        self.rank = rank
-        self.attention_op = attention_op
-
-        q_rank = kwargs.pop("q_rank", None)
-        q_hidden_size = kwargs.pop("q_hidden_size", None)
-        q_rank = q_rank if q_rank is not None else rank
-        q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size
-
-        v_rank = kwargs.pop("v_rank", None)
-        v_hidden_size = kwargs.pop("v_hidden_size", None)
-        v_rank = v_rank if v_rank is not None else rank
-        v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size
-
-        out_rank = kwargs.pop("out_rank", None)
-        out_hidden_size = kwargs.pop("out_hidden_size", None)
-        out_rank = out_rank if out_rank is not None else rank
-        out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size
-
-        self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha)
-        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
-        self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha)
-        self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha)
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states,
-        encoder_hidden_states=None,
-        attention_mask=None,
-        scale=1.0,
-        temb=None,
-        **cross_attention_kwargs
-    ):
-        if attn.residual_connection:
-            residual = hidden_states
-
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.reshape([batch_size, channel, height * width]).transpose([0, 2, 1])
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False)
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
-
-        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
-        query = attn.head_to_batch_dim(query, transpose=False)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
-
-        key = attn.head_to_batch_dim(key, transpose=False)
-        value = attn.head_to_batch_dim(value, transpose=False)
-
-        hidden_states = F.scaled_dot_product_attention_(
-            query,
-            key,
-            value,
-            attn_mask=attention_mask,
-            scale=attn.scale,
-            dropout_p=0.0,
-            training=attn.training,
-            attention_op=self.attention_op,
-        )
-
-        hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False)
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch_size, channel, height, width])
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-
-class CustomDiffusionXFormersAttnProcessor(nn.Layer):
-    r"""
-    Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method.
+    Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method.
 
     Args:
     train_kv (`bool`, defaults to `True`):
@@ -1415,34 +1289,6 @@ def __call__(
         return hidden_states
 
 
-AttnProcessor2_5 = XFormersAttnProcessor
-AttnAddedKVProcessor2_5 = XFormersAttnAddedKVProcessor
-LoRAAttnProcessor2_5 = LoRAXFormersAttnProcessor
-AttentionProcessor = Union[
-    AttnProcessor,
-    AttnProcessor2_5,
-    XFormersAttnProcessor,
-    SlicedAttnProcessor,
-    AttnAddedKVProcessor,
-    SlicedAttnAddedKVProcessor,
-    AttnAddedKVProcessor2_5,
-    XFormersAttnAddedKVProcessor,
-    LoRAAttnProcessor,
-    LoRAXFormersAttnProcessor,
-    LoRAAttnProcessor2_5,
-    LoRAAttnAddedKVProcessor,
-    CustomDiffusionAttnProcessor,
-    CustomDiffusionXFormersAttnProcessor,
-]
-
-LORA_ATTENTION_PROCESSORS = (
-    LoRAAttnProcessor,
-    LoRAAttnProcessor2_5,
-    LoRAXFormersAttnProcessor,
-    LoRAAttnAddedKVProcessor,
-)
-
-
 class SpatialNorm(nn.Layer):
     """
     Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002
@@ -1468,3 +1314,414 @@ def forward(self, f, zq):
         norm_f = self.norm_layer(f)
         new_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
         return new_f
+
+
+# Deprecated
+class LoRAAttnProcessor(nn.Layer):
+    r"""
+    Processor for implementing the LoRA attention mechanism.
+
+    Args:
+        hidden_size (`int`, *optional*):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+        network_alpha (`int`, *optional*):
+            Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs.
+        kwargs (`dict`):
+            Additional keyword arguments to pass to the `LoRALinearLayer` layers.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        cross_attention_dim: Optional[int] = None,
+        rank: int = 4,
+        network_alpha: Optional[int] = None,
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.rank = rank
+
+        q_rank = kwargs.pop("q_rank", None)
+        q_hidden_size = kwargs.pop("q_hidden_size", None)
+        q_rank = q_rank if q_rank is not None else rank
+        q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size
+
+        v_rank = kwargs.pop("v_rank", None)
+        v_hidden_size = kwargs.pop("v_hidden_size", None)
+        v_rank = v_rank if v_rank is not None else rank
+        v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size
+
+        out_rank = kwargs.pop("out_rank", None)
+        out_hidden_size = kwargs.pop("out_hidden_size", None)
+        out_rank = out_rank if out_rank is not None else rank
+        out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size
+
+        self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha)
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        scale=1.0,
+        temb=None,
+        **cross_attention_kwargs
+    ):
+        if attn.residual_connection:
+            residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.reshape([batch_size, channel, height * width]).transpose([0, 2, 1])
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
+
+        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
+        query = attn.head_to_batch_dim(query)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
+
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = paddle.matmul(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch_size, channel, height, width])
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class LoRAXFormersAttnProcessor(nn.Layer):
+    r"""
+    Processor for implementing the LoRA attention mechanism with memory efficient attention using xFormers.
+
+    Args:
+        hidden_size (`int`, *optional*):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+        attention_op (`Callable`, *optional*, defaults to `None`):
+            The base
+            [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
+            use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
+            operator.
+        network_alpha (`int`, *optional*):
+            Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs.
+        kwargs (`dict`):
+            Additional keyword arguments to pass to the `LoRALinearLayer` layers.
+    """
+
+    def __init__(
+        self,
+        hidden_size,
+        cross_attention_dim,
+        rank=4,
+        attention_op: Optional[str] = None,
+        network_alpha=None,
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.rank = rank
+        self.attention_op = attention_op
+
+        q_rank = kwargs.pop("q_rank", None)
+        q_hidden_size = kwargs.pop("q_hidden_size", None)
+        q_rank = q_rank if q_rank is not None else rank
+        q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size
+
+        v_rank = kwargs.pop("v_rank", None)
+        v_hidden_size = kwargs.pop("v_hidden_size", None)
+        v_rank = v_rank if v_rank is not None else rank
+        v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size
+
+        out_rank = kwargs.pop("out_rank", None)
+        out_hidden_size = kwargs.pop("out_hidden_size", None)
+        out_rank = out_rank if out_rank is not None else rank
+        out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size
+
+        self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha)
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        scale=1.0,
+        temb=None,
+        **cross_attention_kwargs
+    ):
+        if attn.residual_connection:
+            residual = hidden_states
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.reshape([batch_size, channel, height * width]).transpose([0, 2, 1])
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
+
+        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
+        query = attn.head_to_batch_dim(query, transpose=False)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
+
+        key = attn.head_to_batch_dim(key, transpose=False)
+        value = attn.head_to_batch_dim(value, transpose=False)
+
+        hidden_states = F.scaled_dot_product_attention_(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            scale=attn.scale,
+            dropout_p=0.0,
+            training=attn.training,
+            attention_op=self.attention_op,
+        )
+
+        hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch_size, channel, height, width])
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class LoRAAttnAddedKVProcessor(nn.Layer):
+    r"""
+    Processor for implementing the LoRA attention mechanism with extra learnable key and value matrices for the text
+    encoder.
+
+    Args:
+        hidden_size (`int`, *optional*):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*, defaults to `None`):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+        network_alpha (`int`, *optional*):
+            Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs.
+        kwargs (`dict`):
+            Additional keyword arguments to pass to the `LoRALinearLayer` layers.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        cross_attention_dim: Optional[int] = None,
+        rank: int = 4,
+        network_alpha: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+        self.rank = rank
+
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        scale=1.0,
+        **cross_attention_kwargs
+    ):
+        residual = hidden_states
+        hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
+            [0, 2, 1]
+        )
+        batch_size, sequence_length, _ = hidden_states.shape
+
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
+
+        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
+        query = attn.head_to_batch_dim(query)
+
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + scale * self.add_k_proj_lora(
+            encoder_hidden_states
+        )
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + scale * self.add_v_proj_lora(
+            encoder_hidden_states
+        )
+        encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
+        encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
+
+        if not attn.only_cross_attention:
+            key = attn.to_k(hidden_states) + scale * self.to_k_lora(hidden_states)
+            value = attn.to_v(hidden_states) + scale * self.to_v_lora(hidden_states)
+            key = attn.head_to_batch_dim(key)
+            value = attn.head_to_batch_dim(value)
+            key = paddle.concat([encoder_hidden_states_key_proj, key], axis=1)
+            value = paddle.concat([encoder_hidden_states_value_proj, value], axis=1)
+        else:
+            key = encoder_hidden_states_key_proj
+            value = encoder_hidden_states_value_proj
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = paddle.matmul(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape)
+        hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
+AttnProcessor2_0 = XFormersAttnProcessor
+AttnAddedKVProcessor2_0 = XFormersAttnAddedKVProcessor
+LoRAAttnProcessor2_0 = LoRAXFormersAttnProcessor
+AttnProcessor2_5 = XFormersAttnProcessor
+AttnAddedKVProcessor2_5 = XFormersAttnAddedKVProcessor
+LoRAAttnProcessor2_5 = LoRAXFormersAttnProcessor
+
+LORA_ATTENTION_PROCESSORS = (
+    LoRAAttnProcessor,
+    LoRAAttnProcessor2_0,
+    LoRAAttnProcessor2_5,
+    LoRAXFormersAttnProcessor,
+    LoRAAttnAddedKVProcessor,
+)
+
+ADDED_KV_ATTENTION_PROCESSORS = (
+    AttnAddedKVProcessor,
+    SlicedAttnAddedKVProcessor,
+    AttnAddedKVProcessor2_0,
+    AttnAddedKVProcessor2_5,
+    XFormersAttnAddedKVProcessor,
+    LoRAAttnAddedKVProcessor,
+)
+
+CROSS_ATTENTION_PROCESSORS = (
+    AttnProcessor,
+    AttnProcessor2_0,
+    AttnProcessor2_5,
+    XFormersAttnProcessor,
+    SlicedAttnProcessor,
+    LoRAAttnProcessor,
+    LoRAAttnProcessor2_0,
+    LoRAAttnProcessor2_5,
+    LoRAXFormersAttnProcessor,
+)
+
+AttentionProcessor = Union[
+    AttnProcessor,
+    AttnProcessor2_0,
+    AttnProcessor2_5,
+    XFormersAttnProcessor,
+    SlicedAttnProcessor,
+    AttnAddedKVProcessor,
+    SlicedAttnAddedKVProcessor,
+    AttnAddedKVProcessor2_0,
+    AttnAddedKVProcessor2_5,
+    XFormersAttnAddedKVProcessor,
+    CustomDiffusionAttnProcessor,
+    CustomDiffusionXFormersAttnProcessor,
+    # deprecated
+    LoRAAttnProcessor,
+    LoRAAttnProcessor2_0,
+    LoRAAttnProcessor2_5,
+    LoRAXFormersAttnProcessor,
+    LoRAAttnAddedKVProcessor,
+]
+
+LORA_ATTENTION_PROCESSORS = (
+    LoRAAttnProcessor,
+    LoRAAttnProcessor2_0,
+    LoRAAttnProcessor2_5,
+    LoRAXFormersAttnProcessor,
+    LoRAAttnAddedKVProcessor,
+)
diff --git a/ppdiffusers/ppdiffusers/models/embeddings.py b/ppdiffusers/ppdiffusers/models/embeddings.py
index 4331e9e70..6391fec12 100644
--- a/ppdiffusers/ppdiffusers/models/embeddings.py
+++ b/ppdiffusers/ppdiffusers/models/embeddings.py
@@ -64,17 +64,22 @@ def get_timestep_embedding(
     return emb
 
 
-def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
+def get_2d_sincos_pos_embed(
+    embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16
+):
     """
     grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
     [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
     """
-    grid_h = np.arange(grid_size, dtype=np.float32)
-    grid_w = np.arange(grid_size, dtype=np.float32)
+    if isinstance(grid_size, int):
+        grid_size = (grid_size, grid_size)
+
+    grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale
+    grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale
     grid = np.meshgrid(grid_w, grid_h)  # here w goes first
     grid = np.stack(grid, axis=0)
 
-    grid = grid.reshape([2, 1, grid_size, grid_size])
+    grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
     pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
     if cls_token and extra_tokens > 0:
         pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
@@ -127,6 +132,7 @@ def __init__(
         layer_norm=False,
         flatten=True,
         bias=True,
+        interpolation_scale=1,
         add_pos_embed=True,
     ):
         super().__init__()
@@ -144,19 +150,47 @@ def __init__(
         else:
             self.norm = None
 
+        self.patch_size = patch_size
+        # See:
+        # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
+        self.height, self.width = height // patch_size, width // patch_size
+        self.base_size = height // patch_size
+        self.interpolation_scale = interpolation_scale
         self.add_pos_embed = add_pos_embed
         if add_pos_embed:
-            pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5))
+            pos_embed = get_2d_sincos_pos_embed(
+                embed_dim,
+                int(num_patches**0.5),
+                base_size=self.base_size,
+                interpolation_scale=self.interpolation_scale,
+            )
             self.register_buffer(
                 "pos_embed", paddle.to_tensor(pos_embed).cast("float32").unsqueeze(0), persistable=False
             )
 
     def forward(self, latent):
+        height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
+
         latent = self.proj(latent)
         if self.flatten:
             latent = latent.flatten(2).transpose([0, 2, 1])  # BCHW -> BNC
         if self.layer_norm:
             latent = self.norm(latent)
+
+        # Interpolate positional embeddings if needed.
+        # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
+        if self.height != height or self.width != width:
+            pos_embed = get_2d_sincos_pos_embed(
+                embed_dim=self.pos_embed.shape[-1],
+                grid_size=(height, width),
+                base_size=self.base_size,
+                interpolation_scale=self.interpolation_scale,
+            )
+            pos_embed = paddle.to_tensor(pos_embed)
+            pos_embed = pos_embed.cast("float32").unsqueeze(0)
+        else:
+            pos_embed = self.pos_embed
+
         if self.add_pos_embed:
             return latent + self.pos_embed
         else:
@@ -259,6 +293,33 @@ def forward(self, x):
         return out
 
 
+class SinusoidalPositionalEmbedding(nn.Layer):
+    """Apply positional information to a sequence of embeddings.
+
+    Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional embeddings to
+    them
+
+    Args:
+        embed_dim: (int): Dimension of the positional embedding.
+        max_seq_length: Maximum sequence length to apply positional embeddings
+
+    """
+
+    def __init__(self, embed_dim: int, max_seq_length: int = 32):
+        super().__init__()
+        position = paddle.arange(max_seq_length).unsqueeze(1)
+        div_term = paddle.exp(paddle.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
+        pe = paddle.zeros(1, max_seq_length, embed_dim)
+        pe[0, :, 0::2] = paddle.sin(position * div_term)
+        pe[0, :, 1::2] = paddle.cos(position * div_term)
+        self.register_buffer("pe", pe)
+
+    def forward(self, x):
+        _, seq_length, _ = x.shape
+        x = x + self.pe[:, :seq_length]
+        return x
+
+
 class ImagePositionalEmbeddings(nn.Layer):
     """
     Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the
@@ -563,3 +624,201 @@ def shape(x):
         a = a.reshape([bs, -1, 1]).transpose([0, 2, 1])
 
         return a[:, 0, :]  # cls_token
+
+
+class FourierEmbedder(nn.Layer):
+    def __init__(self, num_freqs=64, temperature=100):
+        super().__init__()
+
+        self.num_freqs = num_freqs
+        self.temperature = temperature
+
+        freq_bands = temperature ** (paddle.arange(num_freqs) / num_freqs)
+        freq_bands = freq_bands[None, None, None]
+        self.register_buffer("freq_bands", freq_bands, persistent=False)
+
+    def __call__(self, x):
+        x = self.freq_bands * x.unsqueeze(-1)
+        return paddle.stack((x.sin(), x.cos()), dim=-1).transpose([0, 1, 3, 4, 2]).reshape([*x.shape[:2], -1])
+
+
+class PositionNet(nn.Layer):
+    def __init__(self, positive_len, out_dim, feature_type="text-only", fourier_freqs=8):
+        super().__init__()
+        self.positive_len = positive_len
+        self.out_dim = out_dim
+
+        self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs)
+        self.position_dim = fourier_freqs * 2 * 4  # 2: sin/cos, 4: xyxy
+
+        if isinstance(out_dim, tuple):
+            out_dim = out_dim[0]
+
+        if feature_type == "text-only":
+            self.linears = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            tmp_0 = paddle.zeros([self.positive_len])
+            self.null_positive_feature = self.create_parameter(
+                shape=tmp_0.shape, default_initializer=paddle.nn.initializer.Assign(tmp_0)
+            )
+
+        elif feature_type == "text-image":
+            self.linears_text = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            self.linears_image = nn.Sequential(
+                nn.Linear(self.positive_len + self.position_dim, 512),
+                nn.SiLU(),
+                nn.Linear(512, 512),
+                nn.SiLU(),
+                nn.Linear(512, out_dim),
+            )
+            tmp_0 = paddle.zeros([self.positive_len])
+            self.null_text_feature = self.create_parameter(
+                shape=tmp_0.shape, default_initializer=paddle.nn.initializer.Assign(tmp_0)
+            )
+            tmp_1 = paddle.zeros([self.positive_len])
+            self.null_image_feature = self.create_parameter(
+                shape=tmp_1.shape, default_initializer=paddle.nn.initializer.Assign(tmp_1)
+            )
+
+        tmp = paddle.zeros([self.position_dim])
+        self.null_position_feature = self.create_parameter(
+            shape=tmp.shape, default_initializer=paddle.nn.initializer.Assign(tmp)
+        )
+
+    def forward(
+        self,
+        boxes,
+        masks,
+        positive_embeddings=None,
+        phrases_masks=None,
+        image_masks=None,
+        phrases_embeddings=None,
+        image_embeddings=None,
+    ):
+        masks = masks.unsqueeze(-1)
+
+        # embedding position (it may includes padding as placeholder)
+        xyxy_embedding = self.fourier_embedder(boxes)  # B*N*4 -> B*N*C
+
+        # learnable null embedding
+        xyxy_null = self.null_position_feature.view(1, 1, -1)
+
+        # replace padding with learnable null embedding
+        xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null
+
+        # positionet with text only information
+        if positive_embeddings is not None:
+            # learnable null embedding
+            positive_null = self.null_positive_feature.view(1, 1, -1)
+
+            # replace padding with learnable null embedding
+            positive_embeddings = positive_embeddings * masks + (1 - masks) * positive_null
+
+            objs = self.linears(paddle.concat([positive_embeddings, xyxy_embedding], axis=-1))
+
+        # positionet with text and image infomation
+        else:
+            phrases_masks = phrases_masks.unsqueeze(-1)
+            image_masks = image_masks.unsqueeze(-1)
+
+            # learnable null embedding
+            text_null = self.null_text_feature.view(1, 1, -1)
+            image_null = self.null_image_feature.view(1, 1, -1)
+
+            # replace padding with learnable null embedding
+            phrases_embeddings = phrases_embeddings * phrases_masks + (1 - phrases_masks) * text_null
+            image_embeddings = image_embeddings * image_masks + (1 - image_masks) * image_null
+
+            objs_text = self.linears_text(paddle.concat([phrases_embeddings, xyxy_embedding], axis=-1))
+            objs_image = self.linears_image(paddle.concat([image_embeddings, xyxy_embedding], axis=-1))
+            objs = paddle.concat([objs_text, objs_image], axis=1)
+
+        return objs
+
+
+class CombinedTimestepSizeEmbeddings(nn.Layer):
+    """
+    For PixArt-Alpha.
+
+    Reference:
+    https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
+    """
+
+    def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False):
+        super().__init__()
+
+        self.outdim = size_emb_dim
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+        self.use_additional_conditions = use_additional_conditions
+        if use_additional_conditions:
+            self.use_additional_conditions = True
+            self.additional_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+            self.resolution_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
+            self.aspect_ratio_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
+
+    def apply_condition(self, size: paddle.Tensor, batch_size: int, embedder: nn.Layer):
+        if size.ndim == 1:
+            size = size[:, None]
+
+        if size.shape[0] != batch_size:
+            size = size.tile([batch_size // size.shape[0], 1])
+            if size.shape[0] != batch_size:
+                raise ValueError(f"`batch_size` should be {size.shape[0]} but found {batch_size}.")
+
+        current_batch_size, dims = size.shape[0], size.shape[1]
+        size = size.reshape([-1])
+        size_freq = self.additional_condition_proj(size).to(size.dtype)
+
+        size_emb = embedder(size_freq)
+        size_emb = size_emb.reshape([current_batch_size, dims * self.outdim])
+        return size_emb
+
+    def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)
+
+        if self.use_additional_conditions:
+            resolution = self.apply_condition(resolution, batch_size=batch_size, embedder=self.resolution_embedder)
+            aspect_ratio = self.apply_condition(
+                aspect_ratio, batch_size=batch_size, embedder=self.aspect_ratio_embedder
+            )
+            conditioning = timesteps_emb + paddle.concat([resolution, aspect_ratio], axis=1)
+        else:
+            conditioning = timesteps_emb
+
+        return conditioning
+
+
+class CaptionProjection(nn.Layer):
+    """
+    Projects caption embeddings. Also handles dropout for classifier-free guidance.
+
+    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
+    """
+
+    def __init__(self, in_features, hidden_size, num_tokens=120):
+        super().__init__()
+        self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
+        self.act_1 = nn.GELU(approximate=True)
+        self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
+        tmp_0 = paddle.randn(num_tokens, in_features) / in_features**0.5
+        self.register_buffer("y_embedding", self.create_parameter(tmp_0.shape, paddle.nn.initializer.Assign(tmp_0)))
+
+    def forward(self, caption, force_drop_ids=None):
+        hidden_states = self.linear_1(caption)
+        hidden_states = self.act_1(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
diff --git a/ppdiffusers/ppdiffusers/models/lora.py b/ppdiffusers/ppdiffusers/models/lora.py
index 5b2cd9196..45c5d4cb2 100644
--- a/ppdiffusers/ppdiffusers/models/lora.py
+++ b/ppdiffusers/ppdiffusers/models/lora.py
@@ -14,28 +14,61 @@
 
 from typing import Optional
 
+import paddle
 import paddle.nn as nn
+import paddle.nn.functional as F
 
 from ..initializer import normal_, zeros_
+from ..loaders import (
+    PatchedLoraProjection,
+    text_encoder_attn_modules,
+    text_encoder_mlp_modules,
+)
+from ..utils import logging
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def adjust_lora_scale_text_encoder(text_encoder, lora_scale: float = 1.0):
+    for _, attn_module in text_encoder_attn_modules(text_encoder):
+        if isinstance(attn_module.q_proj, PatchedLoraProjection):
+            attn_module.q_proj.lora_scale = lora_scale
+            attn_module.k_proj.lora_scale = lora_scale
+            attn_module.v_proj.lora_scale = lora_scale
+            attn_module.out_proj.lora_scale = lora_scale
+
+    for _, mlp_module in text_encoder_mlp_modules(text_encoder):
+        if isinstance(mlp_module.linear1, PatchedLoraProjection):
+            mlp_module.linear1.lora_scale = lora_scale
+            mlp_module.linear2.lora_scale = lora_scale
 
 
 class LoRALinearLayer(nn.Layer):
+    r"""
+    A linear layer that is used with LoRA.
+
+    Parameters:
+        in_features (`int`):
+            Number of input features.
+        out_features (`int`):
+            Number of output features.
+        rank (`int`, `optional`, defaults to 4):
+            The rank of the LoRA layer.
+        network_alpha (`float`, `optional`, defaults to `None`):
+            The value of the network alpha used for stable learning and preventing underflow. This value has the same
+            meaning as the `--network_alpha` option in the kohya-ss trainer script. See
+            https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
+        device (`torch.device`, `optional`, defaults to `None`):
+            The device to use for the layer's weights.
+        dtype (`torch.dtype`, `optional`, defaults to `None`):
+            The dtype to use for the layer's weights.
+    """
+
     def __init__(self, in_features, out_features, rank=4, network_alpha=None, device=None, dtype=None):
         super().__init__()
 
-        if rank > min(in_features, out_features):
-            raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}")
-
-        self.down = nn.Linear(
-            in_features,
-            rank,
-            bias_attr=False,
-        )
-        self.up = nn.Linear(
-            rank,
-            out_features,
-            bias_attr=False,
-        )
+        self.down = nn.Linear(in_features, rank, bias_attr=False)
+        self.up = nn.Linear(rank, out_features, bias_attr=False)
         if device is not None:
             self.down.to(device=device)
             self.up.to(device=device)
@@ -47,6 +80,8 @@ def __init__(self, in_features, out_features, rank=4, network_alpha=None, device
         # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
         self.network_alpha = network_alpha
         self.rank = rank
+        self.out_features = out_features
+        self.in_features = in_features
 
         normal_(self.down.weight, std=1 / rank)
         zeros_(self.up.weight)
@@ -54,6 +89,7 @@ def __init__(self, in_features, out_features, rank=4, network_alpha=None, device
     def forward(self, hidden_states):
         orig_dtype = hidden_states.dtype
         dtype = self.down.weight.dtype
+
         down_hidden_states = self.down(hidden_states.cast(dtype))
         up_hidden_states = self.up(down_hidden_states)
 
@@ -64,14 +100,33 @@ def forward(self, hidden_states):
 
 
 class LoRAConv2dLayer(nn.Layer):
+    r"""
+    A convolutional layer that is used with LoRA.
+
+    Parameters:
+        in_features (`int`):
+            Number of input features.
+        out_features (`int`):
+            Number of output features.
+        rank (`int`, `optional`, defaults to 4):
+            The rank of the LoRA layer.
+        kernel_size (`int` or `tuple` of two `int`, `optional`, defaults to 1):
+            The kernel size of the convolution.
+        stride (`int` or `tuple` of two `int`, `optional`, defaults to 1):
+            The stride of the convolution.
+        padding (`int` or `tuple` of two `int` or `str`, `optional`, defaults to 0):
+            The padding of the convolution.
+        network_alpha (`float`, `optional`, defaults to `None`):
+            The value of the network alpha used for stable learning and preventing underflow. This value has the same
+            meaning as the `--network_alpha` option in the kohya-ss trainer script. See
+            https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
+    """
+
     def __init__(
         self, in_features, out_features, rank=4, kernel_size=(1, 1), stride=(1, 1), padding=0, network_alpha=None
     ):
         super().__init__()
 
-        if rank > min(in_features, out_features):
-            raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}")
-
         self.down = nn.Conv2D(
             in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias_attr=False
         )
@@ -112,17 +167,78 @@ def __init__(self, *args, lora_layer: Optional[LoRAConv2dLayer] = None, **kwargs
     def set_lora_layer(self, lora_layer: Optional[LoRAConv2dLayer]):
         self.lora_layer = lora_layer
 
-    def forward(self, x):
+    def _fuse_lora(self, lora_scale: float = 1.0, safe_fusing: bool = False):
+        if self.lora_layer is None:
+            return
+
+        dtype = self.weight.dtype
+
+        w_orig = self.weight.astype(paddle.get_default_dtype())
+        w_up = self.lora_layer.up.weight.astype(paddle.get_default_dtype())
+        w_down = self.lora_layer.down.weight.astype(paddle.get_default_dtype())
+
+        if self.lora_layer.network_alpha is not None:
+            w_up = w_up * self.lora_layer.network_alpha / self.lora_layer.rank
+
+        fusion = paddle.mm(w_up.flatten(start_axis=1), w_down.flatten(start_axis=1))
+        fusion = fusion.reshape((w_orig.shape))
+        fused_weight = w_orig + (lora_scale * fusion)
+
+        if safe_fusing and paddle.isnan(fused_weight).any().item():
+            raise ValueError(
+                "This LoRA weight seems to be broken. "
+                f"Encountered NaN values when trying to fuse LoRA weights for {self}."
+                "LoRA weights will not be fused."
+            )
+
+        out_0 = fused_weight.cast(dtype=dtype)
+        self.weight = self.create_parameter(
+            shape=out_0.shape,
+            default_initializer=nn.initializer.Assign(out_0),
+        )
+
+        # we can drop the lora layer now
+        self.lora_layer = None
+
+        # offload the up and down matrices to CPU to not blow the memory
+        self.w_up = w_up.cpu()
+        self.w_down = w_down.cpu()
+        self._lora_scale = lora_scale
+
+    def _unfuse_lora(self):
+        if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None):
+            return
+
+        fused_weight = self.weight
+        dtype = fused_weight.dtype
+
+        self.w_up = self.w_up.astype(paddle.get_default_dtype())
+        self.w_down = self.w_down.astype(paddle.get_default_dtype())
+
+        fusion = paddle.mm(self.w_up.flatten(start_axis=1), self.w_down.flatten(start_axis=1))
+        fusion = fusion.reshape((fused_weight.shape))
+        unfused_weight = fused_weight.astype(paddle.get_default_dtype()) - (self._lora_scale * fusion)
+        out_0 = unfused_weight.cast(dtype=dtype)
+        self.weight = self.create_parameter(
+            shape=out_0.shape,
+            default_initializer=nn.initializer.Assign(out_0),
+        )
+
+        self.w_up = None
+        self.w_down = None
+
+    def forward(self, hidden_states, scale: float = 1.0):
         if self.lora_layer is None:
             # make sure to the functional Conv2D function as otherwise torch.compile's graph will break
             # see: https://github.com/huggingface/diffusers/pull/4315
-            return nn.functional.conv2d(
-                x, self.weight, self.bias, self._stride, self._padding, self._dilation, self._groups
+            return F.conv2d(
+                hidden_states, self.weight, self.bias, self._stride, self._padding, self._dilation, self._groups
             )
-            # return super().forward(x)
-            # return nn.functional.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
         else:
-            return super().forward(x) + self.lora_layer(x)
+            original_outputs = F.conv2d(
+                hidden_states, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
+            )
+            return original_outputs + (scale * self.lora_layer(hidden_states))
 
 
 class LoRACompatibleLinear(nn.Linear):
@@ -134,17 +250,76 @@ def __init__(self, *args, lora_layer: Optional[LoRALinearLayer] = None, **kwargs
         super().__init__(*args, **kwargs)
         self.lora_layer = lora_layer
 
-    def set_lora_layer(self, lora_layer: Optional[LoRAConv2dLayer]):
+    def set_lora_layer(self, lora_layer: Optional[LoRALinearLayer]):
         self.lora_layer = lora_layer
 
-    def forward(self, x):
+    def _fuse_lora(self, lora_scale: float = 1.0, safe_fusing: bool = False):
+        if self.lora_layer is None:
+            return
+
+        dtype = self.weight.dtype
+
+        w_orig = self.weight.astype(paddle.get_default_dtype())
+        w_up = self.lora_layer.up.weight.astype(paddle.get_default_dtype())
+        w_down = self.lora_layer.down.weight.astype(paddle.get_default_dtype())
+
+        if self.lora_layer.network_alpha is not None:
+            w_up = w_up * self.lora_layer.network_alpha / self.lora_layer.rank
+
+        fused_weight = w_orig + (lora_scale * paddle.bmm(w_up.T[None, :], w_down.T[None, :])[0]).T
+
+        if safe_fusing and paddle.isnan(fused_weight).any().item():
+            raise ValueError(
+                "This LoRA weight seems to be broken. "
+                f"Encountered NaN values when trying to fuse LoRA weights for {self}."
+                "LoRA weights will not be fused."
+            )
+
+        out_0 = fused_weight.cast(dtype=dtype)
+        self.weight = self.create_parameter(
+            shape=out_0.shape,
+            default_initializer=nn.initializer.Assign(out_0),
+        )
+
+        # we can drop the lora layer now
+        self.lora_layer = None
+
+        # offload the up and down matrices to CPU to not blow the memory
+        self.w_up = w_up.cpu()
+        self.w_down = w_down.cpu()
+        self._lora_scale = lora_scale
+
+    def _unfuse_lora(self):
+        if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None):
+            return
+
+        fused_weight = self.weight
+        dtype = fused_weight.dtype
+
+        w_up = self.w_up.astype(paddle.get_default_dtype())
+        w_down = self.w_down.astype(paddle.get_default_dtype())
+
+        unfused_weight = (
+            fused_weight.astype(paddle.get_default_dtype())
+            - (self._lora_scale * paddle.bmm(w_up.T[None, :], w_down.T[None, :])[0]).T
+        )
+        out_0 = unfused_weight.cast(dtype=dtype)
+        self.weight = self.create_parameter(
+            shape=out_0.shape,
+            default_initializer=nn.initializer.Assign(out_0),
+        )
+
+        self.w_up = None
+        self.w_down = None
+
+    def forward(self, hidden_states, scale: float = 1.0):
         # breakpoint()
         if self.lora_layer is None:
-            # return super().forward(x)
+            # return super().forward(hidden_states)
             return nn.functional.linear(
-                x,
+                hidden_states,
                 self.weight,
                 self.bias,
             )
         else:
-            return super().forward(x) + self.lora_layer(x)
+            return super().forward(hidden_states) + (scale * self.lora_layer(hidden_states))
diff --git a/ppdiffusers/ppdiffusers/models/normalization.py b/ppdiffusers/ppdiffusers/models/normalization.py
new file mode 100644
index 000000000..5a0726f19
--- /dev/null
+++ b/ppdiffusers/ppdiffusers/models/normalization.py
@@ -0,0 +1,150 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Optional, Tuple
+
+import paddle
+import paddle.nn as nn
+
+from .activations import get_activation
+from .embeddings import CombinedTimestepLabelEmbeddings, CombinedTimestepSizeEmbeddings
+
+
+class AdaLayerNorm(nn.Layer):
+    r"""
+    Norm layer modified to incorporate timestep embeddings.
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        num_embeddings (`int`): The size of the embeddings dictionary.
+    """
+
+    def __init__(self, embedding_dim: int, num_embeddings: int):
+        super().__init__()
+        self.emb = nn.Embedding(num_embeddings, embedding_dim)
+        self.silu = nn.Silu()
+        self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
+        # elementwise_affine=False
+        norm_kwargs = {"weight_attr": False, "bias_attr": False}
+        self.norm = nn.LayerNorm(embedding_dim, **norm_kwargs)
+
+    def forward(self, x, timestep):
+        emb = self.linear(self.silu(self.emb(timestep)))
+        # must set axis=-1, paddle vs pytorch
+        scale, shift = paddle.chunk(emb, 2, axis=-1)
+        x = self.norm(x) * (1 + scale) + shift
+        return x
+
+
+class AdaLayerNormZero(nn.Layer):
+    r"""
+    Norm layer adaptive layer norm zero (adaLN-Zero).
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        num_embeddings (`int`): The size of the embeddings dictionary.
+    """
+
+    def __init__(self, embedding_dim: int, num_embeddings: int):
+        super().__init__()
+
+        self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim)
+
+        self.silu = nn.Silu()
+        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias_attr=True)
+        # elementwise_affine=False
+        norm_kwargs = {"weight_attr": False, "bias_attr": False}
+        self.norm = nn.LayerNorm(embedding_dim, epsilon=1e-6, **norm_kwargs)
+
+    def forward(self, x, timestep, class_labels, hidden_dtype=None):
+        emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, axis=1)
+        x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
+        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
+
+
+class AdaLayerNormSingle(nn.Layer):
+    r"""
+    Norm layer adaptive layer norm single (adaLN-single).
+
+    As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        use_additional_conditions (`bool`): To use additional conditions for normalization or not.
+    """
+
+    def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
+        super().__init__()
+
+        self.emb = CombinedTimestepSizeEmbeddings(
+            embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions
+        )
+
+        self.silu = nn.Silu()
+        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
+
+    def forward(
+        self,
+        timestep: paddle.Tensor,
+        added_cond_kwargs: Dict[str, paddle.Tensor] = None,
+        batch_size: int = None,
+        hidden_dtype: Optional[paddle.dtype] = None,
+    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+        # No modulation happening here.
+        embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
+        return self.linear(self.silu(embedded_timestep)), embedded_timestep
+
+
+class AdaGroupNorm(nn.Layer):
+    r"""
+    GroupNorm layer modified to incorporate timestep embeddings.
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        num_embeddings (`int`): The size of the embeddings dictionary.
+        num_groups (`int`): The number of groups to separate the channels into.
+        act_fn (`str`, *optional*, defaults to `None`): The activation function to use.
+        eps (`float`, *optional*, defaults to `1e-5`): The epsilon value to use for numerical stability.
+    """
+
+    def __init__(
+        self, embedding_dim: int, out_dim: int, num_groups: int, act_fn: Optional[str] = None, eps: float = 1e-5
+    ):
+        super().__init__()
+        self.num_groups = num_groups
+        self.eps = eps
+
+        if act_fn is None:
+            self.act = None
+        else:
+            self.act = get_activation(act_fn)
+
+        self.linear = nn.Linear(embedding_dim, out_dim * 2)
+        # elementwise_affine=False
+        norm_kwargs = {"weight_attr": False, "bias_attr": False}
+        self.group_norm = nn.GroupNorm(num_groups, out_dim, epsilon=eps, **norm_kwargs)
+        self.group_norm.weight = None
+        self.group_norm.bias = None
+
+    def forward(self, x: paddle.Tensor, emb: paddle.Tensor) -> paddle.Tensor:
+        if self.act:
+            emb = self.act(emb)
+        emb = self.linear(emb)
+        emb = emb[:, :, None, None]
+        scale, shift = emb.chunk(2, axis=1)
+        x = self.group_norm(x)
+        x = x * (1 + scale) + shift
+        return x
diff --git a/ppdiffusers/ppdiffusers/models/prior_transformer.py b/ppdiffusers/ppdiffusers/models/prior_transformer.py
index d8f0b97d6..f10fa2596 100644
--- a/ppdiffusers/ppdiffusers/models/prior_transformer.py
+++ b/ppdiffusers/ppdiffusers/models/prior_transformer.py
@@ -21,9 +21,16 @@
 import paddle.nn.functional as F
 
 from ..configuration_utils import ConfigMixin, register_to_config
+from ..loaders import UNet2DConditionLoadersMixin
 from ..utils import NEG_INF, BaseOutput
 from .attention import BasicTransformerBlock
-from .attention_processor import AttentionProcessor, AttnProcessor
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
 from .embeddings import TimestepEmbedding, Timesteps
 from .modeling_utils import ModelMixin
 
@@ -41,7 +48,7 @@ class PriorTransformerOutput(BaseOutput):
     predicted_image_embedding: paddle.Tensor
 
 
-class PriorTransformer(ModelMixin, ConfigMixin):
+class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
     """
     A Prior Transformer model.
 
@@ -197,8 +204,8 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
         processors = {}
 
         def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "set_processor"):
-                processors[f"{name}.processor"] = module.processor
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
 
             for sub_name, child in module.named_children():
                 fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -211,7 +218,9 @@ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[st
         return processors
 
     # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
         r"""
         Sets the attention processor to use to compute attention.
         Parameters:
@@ -232,9 +241,9 @@ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, Atte
         def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
             if hasattr(module, "set_processor"):
                 if not isinstance(processor, dict):
-                    module.set_processor(processor)
+                    module.set_processor(processor, _remove_lora=_remove_lora)
                 else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
 
             for sub_name, child in module.named_children():
                 fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
@@ -247,7 +256,16 @@ def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.
         """
-        self.set_attn_processor(AttnProcessor())
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor, _remove_lora=True)
 
     def forward(
         self,
diff --git a/ppdiffusers/ppdiffusers/models/resnet.py b/ppdiffusers/ppdiffusers/models/resnet.py
index 69a5cb502..168b828b5 100644
--- a/ppdiffusers/ppdiffusers/models/resnet.py
+++ b/ppdiffusers/ppdiffusers/models/resnet.py
@@ -23,9 +23,9 @@
 
 from ..initializer import zeros_
 from .activations import get_activation
-from .attention import AdaGroupNorm
 from .attention_processor import SpatialNorm
 from .lora import LoRACompatibleConv, LoRACompatibleLinear
+from .normalization import AdaGroupNorm
 
 
 class Upsample1D(nn.Layer):
@@ -40,6 +40,8 @@ class Upsample1D(nn.Layer):
             option to use a convolution transpose.
         out_channels (`int`, optional):
             number of output channels. Defaults to `channels`.
+        name (`str`, default `conv`):
+            name of the upsampling 1D layer.
     """
 
     def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
@@ -81,6 +83,8 @@ class Downsample1D(nn.Layer):
             number of output channels. Defaults to `channels`.
         padding (`int`, default `1`):
             padding for the convolution.
+        name (`str`, default `conv`):
+            name of the downsampling 1D layer.
     """
 
     def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
@@ -115,6 +119,8 @@ class Upsample2D(nn.Layer):
             option to use a convolution transpose.
         out_channels (`int`, optional):
             number of output channels. Defaults to `channels`.
+        name (`str`, default `conv`):
+            name of the upsampling 2D layer.
     """
 
     def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
@@ -137,7 +143,7 @@ def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_chann
         else:
             self.Conv2d_0 = conv
 
-    def forward(self, hidden_states, output_size=None):
+    def forward(self, hidden_states, output_size=None, scale: float = 1.0):
         assert hidden_states.shape[1] == self.channels
 
         if self.use_conv_transpose:
@@ -150,6 +156,10 @@ def forward(self, hidden_states, output_size=None):
         if dtype == paddle.bfloat16:
             hidden_states = hidden_states.cast("float32")
 
+        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+        if hidden_states.shape[0] >= 64:
+            hidden_states = hidden_states.contiguous() if hasattr(hidden_states, "contiguous") else hidden_states
+
         # if `output_size` is passed we force the interpolation output
         # size and do not make use of `scale_factor=2`
         if output_size is None:
@@ -164,9 +174,15 @@ def forward(self, hidden_states, output_size=None):
         # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
         if self.use_conv:
             if self.name == "conv":
-                hidden_states = self.conv(hidden_states)
+                if isinstance(self.conv, LoRACompatibleConv):
+                    hidden_states = self.conv(hidden_states, scale)
+                else:
+                    hidden_states = self.conv(hidden_states)
             else:
-                hidden_states = self.Conv2d_0(hidden_states)
+                if isinstance(self.Conv2d_0, LoRACompatibleConv):
+                    hidden_states = self.Conv2d_0(hidden_states, scale)
+                else:
+                    hidden_states = self.Conv2d_0(hidden_states)
 
         return hidden_states
 
@@ -183,6 +199,8 @@ class Downsample2D(nn.Layer):
             number of output channels. Defaults to `channels`.
         padding (`int`, default `1`):
             padding for the convolution.
+        name (`str`, default `conv`):
+            name of the downsampling 2D layer.
     """
 
     def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
@@ -209,14 +227,17 @@ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name=
         else:
             self.conv = conv
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states, scale: float = 1.0):
         assert hidden_states.shape[1] == self.channels
         if self.use_conv and self.padding == 0:
             pad = (0, 1, 0, 1)
             hidden_states = F.pad(hidden_states, pad, mode="constant", value=0)
 
         assert hidden_states.shape[1] == self.channels
-        hidden_states = self.conv(hidden_states)
+        if isinstance(self.conv, LoRACompatibleConv):
+            hidden_states = self.conv(hidden_states, scale)
+        else:
+            hidden_states = self.conv(hidden_states)
 
         return hidden_states
 
@@ -423,6 +444,12 @@ def forward(self, hidden_states):
 
 # downsample/upsample layer used in k-upscaler, might be able to use FirDownsample2D/DirUpsample2D instead
 class KDownsample2D(nn.Layer):
+    r"""A 2D K-downsampling layer.
+
+    Parameters:
+        pad_mode (`str`, *optional*, default to `"reflect"`): the padding mode to use.
+    """
+
     def __init__(self, pad_mode="reflect"):
         super().__init__()
         self.pad_mode = pad_mode
@@ -442,6 +469,12 @@ def forward(self, inputs):
 
 
 class KUpsample2D(nn.Layer):
+    r"""A 2D K-upsampling layer.
+
+    Parameters:
+        pad_mode (`str`, *optional*, default to `"reflect"`): the padding mode to use.
+    """
+
     def __init__(self, pad_mode="reflect"):
         super().__init__()
         self.pad_mode = pad_mode
@@ -592,7 +625,7 @@ def __init__(
                 in_channels, conv_2d_out_channels, kernel_size=1, stride=1, padding=0, bias_attr=conv_shortcut_bias
             )
 
-    def forward(self, input_tensor, temb):
+    def forward(self, input_tensor, temb, scale: float = 1.0):
         hidden_states = input_tensor
 
         if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
@@ -603,18 +636,34 @@ def forward(self, input_tensor, temb):
         hidden_states = self.nonlinearity(hidden_states)
 
         if self.upsample is not None:
-            input_tensor = self.upsample(input_tensor)
-            hidden_states = self.upsample(hidden_states)
+            input_tensor = (
+                self.upsample(input_tensor, scale=scale)
+                if isinstance(self.upsample, Upsample2D)
+                else self.upsample(input_tensor)
+            )
+            hidden_states = (
+                self.upsample(hidden_states, scale=scale)
+                if isinstance(self.upsample, Upsample2D)
+                else self.upsample(hidden_states)
+            )
         elif self.downsample is not None:
-            input_tensor = self.downsample(input_tensor)
-            hidden_states = self.downsample(hidden_states)
+            input_tensor = (
+                self.downsample(input_tensor, scale=scale)
+                if isinstance(self.downsample, Downsample2D)
+                else self.downsample(input_tensor)
+            )
+            hidden_states = (
+                self.downsample(hidden_states, scale=scale)
+                if isinstance(self.downsample, Downsample2D)
+                else self.downsample(hidden_states)
+            )
 
-        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.conv1(hidden_states, scale)
 
         if self.time_emb_proj is not None:
             if not self.pre_temb_non_linearity and not self.skip_time_act:
                 temb = self.nonlinearity(temb)
-            temb = self.time_emb_proj(temb)[:, :, None, None]
+            temb = self.time_emb_proj(temb, scale)[:, :, None, None]
 
         if temb is not None and self.time_embedding_norm == "default":
             hidden_states = hidden_states + temb
@@ -631,10 +680,10 @@ def forward(self, input_tensor, temb):
         hidden_states = self.nonlinearity(hidden_states)
 
         hidden_states = self.dropout(hidden_states)
-        hidden_states = self.conv2(hidden_states)
+        hidden_states = self.conv2(hidden_states, scale)
 
         if self.conv_shortcut is not None:
-            input_tensor = self.conv_shortcut(input_tensor)
+            input_tensor = self.conv_shortcut(input_tensor, scale)
 
         # TODO this maybe result -inf, input_tensor's min value -57644  hidden_states's min value -10000
         output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
@@ -657,14 +706,21 @@ def rearrange_dims(tensor):
 class Conv1dBlock(nn.Layer):
     """
     Conv1d --> GroupNorm --> Mish
+
+    Parameters:
+        inp_channels (`int`): Number of input channels.
+        out_channels (`int`): Number of output channels.
+        kernel_size (`int` or `tuple`): Size of the convolving kernel.
+        n_groups (`int`, default `8`): Number of groups to separate the channels into.
+        activation (`str`, defaults `mish`): Name of the activation function.
     """
 
-    def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8):
+    def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8, activation: str = "mish"):
         super().__init__()
 
         self.conv1d = nn.Conv1D(inp_channels, out_channels, kernel_size, padding=kernel_size // 2)
         self.group_norm = nn.GroupNorm(n_groups, out_channels)
-        self.mish = nn.Mish()
+        self.mish = get_activation(activation)
 
     def forward(self, inputs):
         intermediate_repr = self.conv1d(inputs)
@@ -677,12 +733,23 @@ def forward(self, inputs):
 
 # unet_rl.py
 class ResidualTemporalBlock1D(nn.Layer):
-    def __init__(self, inp_channels, out_channels, embed_dim, kernel_size=5):
+    """
+    Residual 1D block with temporal convolutions.
+
+    Parameters:
+        inp_channels (`int`): Number of input channels.
+        out_channels (`int`): Number of output channels.
+        embed_dim (`int`): Embedding dimension.
+        kernel_size (`int` or `tuple`): Size of the convolving kernel.
+        activation (`str`, defaults `mish`): It is possible to choose the right activation function.
+    """
+
+    def __init__(self, inp_channels, out_channels, embed_dim, kernel_size=5, activation: str = "mish"):
         super().__init__()
         self.conv_in = Conv1dBlock(inp_channels, out_channels, kernel_size)
         self.conv_out = Conv1dBlock(out_channels, out_channels, kernel_size)
 
-        self.time_emb_act = nn.Mish()
+        self.time_emb_act = get_activation(activation)
         self.time_emb = nn.Linear(embed_dim, out_channels)
 
         self.residual_conv = (
@@ -852,6 +919,11 @@ class TemporalConvLayer(nn.Layer):
     """
     Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
     https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016
+
+    Parameters:
+        in_dim (`int`): Number of input channels.
+        out_dim (`int`): Number of output channels.
+        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
     """
 
     def __init__(self, in_dim, out_dim=None, dropout=0.0):
@@ -883,6 +955,8 @@ def __init__(self, in_dim, out_dim=None, dropout=0.0):
             nn.Dropout(p=dropout),
             nn.Conv3D(in_channels=out_dim, out_channels=in_dim, kernel_size=(3, 1, 1), padding=(1, 0, 0)),
         )
+
+        # zero out the last layer params,so the conv block is identity
         zeros_(self.conv4[-1].weight)
         zeros_(self.conv4[-1].bias)
 
@@ -892,12 +966,15 @@ def forward(self, hidden_states, num_frames=1):
             .reshape((-1, num_frames) + tuple(hidden_states.shape[1:]))
             .transpose(perm=[0, 2, 1, 3, 4])
         )
+
         identity = hidden_states
         hidden_states = self.conv1(hidden_states)
         hidden_states = self.conv2(hidden_states)
         hidden_states = self.conv3(hidden_states)
         hidden_states = self.conv4(hidden_states)
+
         hidden_states = identity + hidden_states
+
         hidden_states = hidden_states.transpose(perm=[0, 2, 1, 3, 4]).reshape(
             (hidden_states.shape[0] * hidden_states.shape[2], -1) + tuple(hidden_states.shape[3:])
         )
diff --git a/ppdiffusers/ppdiffusers/models/transformer_2d.py b/ppdiffusers/ppdiffusers/models/transformer_2d.py
index c2f32ed42..fdaa135c0 100644
--- a/ppdiffusers/ppdiffusers/models/transformer_2d.py
+++ b/ppdiffusers/ppdiffusers/models/transformer_2d.py
@@ -18,14 +18,16 @@
 import paddle
 import paddle.nn.functional as F
 from paddle import nn
+from paddle.distributed.fleet.utils import recompute
 
 from ..configuration_utils import ConfigMixin, register_to_config
 from ..models.embeddings import ImagePositionalEmbeddings
 from ..utils import BaseOutput, deprecate
 from .attention import BasicTransformerBlock
-from .embeddings import PatchEmbed
+from .embeddings import CaptionProjection, PatchEmbed
 from .lora import LoRACompatibleConv, LoRACompatibleLinear
 from .modeling_utils import ModelMixin
+from .normalization import AdaLayerNormSingle
 
 
 @dataclass
@@ -89,9 +91,13 @@ def __init__(
         num_embeds_ada_norm: Optional[int] = None,
         use_linear_projection: bool = False,
         only_cross_attention: bool = False,
+        double_self_attention: bool = False,
         upcast_attention: bool = False,
         norm_type: str = "layer_norm",
         norm_elementwise_affine: bool = True,
+        norm_eps: float = 1e-5,
+        attention_type: str = "default",
+        caption_channels: int = None,
     ):
         super().__init__()
         self.use_linear_projection = use_linear_projection
@@ -160,12 +166,15 @@ def __init__(
             self.width = sample_size
 
             self.patch_size = patch_size
+            interpolation_scale = self.config.sample_size // 64  # => 64 (= 512 pixart) has interpolation scale 1
+            interpolation_scale = max(interpolation_scale, 1)
             self.pos_embed = PatchEmbed(
                 height=sample_size,
                 width=sample_size,
                 patch_size=patch_size,
                 in_channels=in_channels,
                 embed_dim=inner_dim,
+                interpolation_scale=interpolation_scale,
             )
 
         # 3. Define transformers blocks
@@ -181,9 +190,12 @@ def __init__(
                     num_embeds_ada_norm=num_embeds_ada_norm,
                     attention_bias=attention_bias,
                     only_cross_attention=only_cross_attention,
+                    double_self_attention=double_self_attention,
                     upcast_attention=upcast_attention,
                     norm_type=norm_type,
                     norm_elementwise_affine=norm_elementwise_affine,
+                    norm_eps=norm_eps,
+                    attention_type=attention_type,
                 )
                 for d in range(num_layers)
             ]
@@ -200,18 +212,42 @@ def __init__(
         elif self.is_input_vectorized:
             self.norm_out = nn.LayerNorm(inner_dim)
             self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
-        elif self.is_input_patches:
+        elif self.is_input_patches and norm_type != "ada_norm_single":
             # elementwise_affine=False
             norm_kwargs = {"weight_attr": False, "bias_attr": False}
             self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_kwargs)
             self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
             self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
+        elif self.is_input_patches and norm_type == "ada_norm_single":
+            norm_kwargs = {"weight_attr": False, "bias_attr": False}
+            self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_kwargs)
+            tmp_param = paddle.randn(2, inner_dim) / inner_dim**0.5
+            self.scale_shift_table = self.create_parameter(
+                shape=tmp_param.shape, default_initializer=paddle.nn.initializer.Assign(tmp_param)
+            )
+            self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
+
+        # 5. PixArt-Alpha blocks.
+        self.adaln_single = None
+        self.use_additional_conditions = False
+        if norm_type == "ada_norm_single":
+            self.use_additional_conditions = self.config.sample_size == 128
+            # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
+            # additional conditions until we find better name
+            self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions)
+
+        self.caption_projection = None
+        if caption_channels is not None:
+            self.caption_projection = CaptionProjection(in_features=caption_channels, hidden_size=inner_dim)
+
+        self.gradient_checkpointing = False
 
     def forward(
         self,
         hidden_states: paddle.Tensor,
         encoder_hidden_states: Optional[paddle.Tensor] = None,
         timestep: Optional[paddle.Tensor] = None,
+        added_cond_kwargs: Dict[str, paddle.Tensor] = None,
         class_labels: Optional[paddle.Tensor] = None,
         cross_attention_kwargs: Dict[str, Any] = None,
         attention_mask: Optional[paddle.Tensor] = None,
@@ -232,6 +268,14 @@ def forward(
             class_labels ( `paddle.Tensor` of shape `(batch size, num classes)`, *optional*):
                 Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
                 `AdaLayerZeroNorm`.
+            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            attention_mask ( `paddle.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
             encoder_attention_mask ( `paddle.Tensor`, *optional*):
                 Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
 
@@ -272,40 +316,94 @@ def forward(
             encoder_attention_mask = (1 - encoder_attention_mask.cast(hidden_states.dtype)) * -10000.0
             encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
 
+        # Retrieve lora scale.
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
         # 1. Input
         if self.is_input_continuous:
-            _, _, height, width = hidden_states.shape
+            batch, _, height, width = hidden_states.shape
             residual = hidden_states
             hidden_states = self.norm(hidden_states)
             if not self.use_linear_projection:
-                hidden_states = self.proj_in(hidden_states)
+                hidden_states = self.proj_in(hidden_states, scale=lora_scale)
             hidden_states = hidden_states.transpose([0, 2, 3, 1]).flatten(1, 2)
             if self.use_linear_projection:
-                hidden_states = self.proj_in(hidden_states)
+                hidden_states = self.proj_in(hidden_states, scale=lora_scale)
         elif self.is_input_vectorized:
-            hidden_states = self.latent_image_embedding(hidden_states.cast("int64"))
+            # paddle original code:
+            # hidden_states = self.latent_image_embedding(hidden_states.cast("int64"))
+            hidden_states = self.latent_image_embedding(hidden_states)
         elif self.is_input_patches:
+            height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
             hidden_states = self.pos_embed(hidden_states)
 
+            if self.adaln_single is not None:
+                if self.use_additional_conditions and added_cond_kwargs is None:
+                    raise ValueError(
+                        "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
+                    )
+                batch_size = hidden_states.shape[0]
+                timestep, embedded_timestep = self.adaln_single(
+                    timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+                )
+
         # 2. Blocks
+        if self.caption_projection is not None:
+            batch_size = hidden_states.shape[0]
+            encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
         for block in self.transformer_blocks:
-            hidden_states = block(
-                hidden_states,
-                attention_mask=attention_mask,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
-                timestep=timestep,
-                cross_attention_kwargs=cross_attention_kwargs,
-                class_labels=class_labels,
-            )
+            if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
+                # hidden_states = torch.utils.checkpoint.checkpoint(
+                #     block,
+                #     hidden_states,
+                #     attention_mask,
+                #     encoder_hidden_states,
+                #     encoder_attention_mask,
+                #     timestep,
+                #     cross_attention_kwargs,
+                #     class_labels,
+                #     use_reentrant=False,
+                # )
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)[0]  # move [0] when paddlepaddle <= 2.4.1
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = recompute(
+                    create_custom_forward(block),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    timestep,
+                    cross_attention_kwargs,
+                    class_labels,
+                    # use_reentrant=False,
+                )  # [0]
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    timestep=timestep,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=class_labels,
+                )
 
         # 3. Output
         if self.is_input_continuous:
             if self.use_linear_projection:
-                hidden_states = self.proj_out(hidden_states)
-            hidden_states = hidden_states.reshape([-1, height, width, self.inner_dim]).transpose([0, 3, 1, 2])
+                hidden_states = self.proj_out(hidden_states, scale=lora_scale)
+            hidden_states = hidden_states.reshape([batch, height, width, self.inner_dim]).transpose([0, 3, 1, 2])
             if not self.use_linear_projection:
-                hidden_states = self.proj_out(hidden_states)
+                hidden_states = self.proj_out(hidden_states, scale=lora_scale)
             output = hidden_states + residual
         elif self.is_input_vectorized:
             hidden_states = self.norm_out(hidden_states)
@@ -315,17 +413,25 @@ def forward(
 
             # log(p(x_0))
             output = F.log_softmax(logits.cast("float64"), axis=1).cast("float32")
-        elif self.is_input_patches:
-            # TODO: cleanup!
-            conditioning = self.transformer_blocks[0].norm1.emb(
-                timestep, class_labels, hidden_dtype=hidden_states.dtype
-            )
-            shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, axis=1)
-            hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
-            hidden_states = self.proj_out_2(hidden_states)
+        if self.is_input_patches:
+            if self.config.norm_type != "ada_norm_single":
+                conditioning = self.transformer_blocks[0].norm1.emb(
+                    timestep, class_labels, hidden_dtype=hidden_states.dtype
+                )
+                shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, axis=1)
+                hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+                hidden_states = self.proj_out_2(hidden_states)
+            elif self.config.norm_type == "ada_norm_single":
+                shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, axis=1)
+                hidden_states = self.norm_out(hidden_states)
+                # Modulation
+                hidden_states = hidden_states * (1 + scale) + shift
+                hidden_states = self.proj_out(hidden_states)
+                hidden_states = hidden_states.squeeze(1)
 
             # unpatchify
-            height = width = int(hidden_states.shape[1] ** 0.5)
+            if self.adaln_single is None:
+                height = width = int(hidden_states.shape[1] ** 0.5)
             hidden_states = hidden_states.reshape(
                 (-1, height, width, self.patch_size, self.patch_size, self.out_channels)
             )
diff --git a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py
index efcce95db..8ace1771a 100644
--- a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py
+++ b/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import Optional
+from typing import Optional, Union
 
 import paddle
 import paddle.nn.functional as F
@@ -27,17 +27,17 @@
 class DownResnetBlock1D(nn.Layer):
     def __init__(
         self,
-        in_channels,
-        out_channels=None,
-        num_layers=1,
-        conv_shortcut=False,
-        temb_channels=32,
-        groups=32,
-        groups_out=None,
-        non_linearity=None,
-        time_embedding_norm="default",
-        output_scale_factor=1.0,
-        add_downsample=True,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        conv_shortcut: bool = False,
+        temb_channels: int = 32,
+        groups: int = 32,
+        groups_out: Optional[int] = None,
+        non_linearity: Optional[str] = None,
+        time_embedding_norm: str = "default",
+        output_scale_factor: float = 1.0,
+        add_downsample: bool = True,
     ):
         super().__init__()
         self.in_channels = in_channels
@@ -89,16 +89,16 @@ def forward(self, hidden_states, temb=None):
 class UpResnetBlock1D(nn.Layer):
     def __init__(
         self,
-        in_channels,
-        out_channels=None,
-        num_layers=1,
-        temb_channels=32,
-        groups=32,
-        groups_out=None,
-        non_linearity=None,
-        time_embedding_norm="default",
-        output_scale_factor=1.0,
-        add_upsample=True,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        temb_channels: int = 32,
+        groups: int = 32,
+        groups_out: Optional[int] = None,
+        non_linearity: Optional[str] = None,
+        time_embedding_norm: str = "default",
+        output_scale_factor: float = 1.0,
+        add_upsample: bool = True,
     ):
         super().__init__()
         self.in_channels = in_channels
@@ -169,13 +169,13 @@ def forward(self, x, temb=None):
 class MidResTemporalBlock1D(nn.Layer):
     def __init__(
         self,
-        in_channels,
-        out_channels,
-        embed_dim,
+        in_channels: int,
+        out_channels: int,
+        embed_dim: int,
         num_layers: int = 1,
         add_downsample: bool = False,
         add_upsample: bool = False,
-        non_linearity=None,
+        non_linearity: Optional[str] = None,
     ):
         super().__init__()
         self.in_channels = in_channels
@@ -220,7 +220,7 @@ def forward(self, hidden_states, temb):
 
 
 class OutConv1DBlock(nn.Layer):
-    def __init__(self, num_groups_out, out_channels, embed_dim, act_fn):
+    def __init__(self, num_groups_out: int, out_channels: int, embed_dim: int, act_fn: str):
         super().__init__()
         self.final_conv1d_1 = nn.Conv1D(embed_dim, embed_dim, 5, padding=2)
         self.final_conv1d_gn = nn.GroupNorm(num_groups_out, embed_dim)
@@ -238,7 +238,7 @@ def forward(self, hidden_states, temb=None):
 
 
 class OutValueFunctionBlock(nn.Layer):
-    def __init__(self, fc_dim, embed_dim, act_fn="mish"):
+    def __init__(self, fc_dim: int, embed_dim: int, act_fn: str = "mish"):
         super().__init__()
         self.final_block = nn.LayerList(
             [
@@ -278,7 +278,7 @@ def forward(self, hidden_states, temb):
 
 
 class Downsample1d(nn.Layer):
-    def __init__(self, kernel="linear", pad_mode="reflect"):
+    def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"):
         super().__init__()
         self.pad_mode = pad_mode
         kernel_1d = paddle.to_tensor(_kernels[kernel])
@@ -296,7 +296,7 @@ def forward(self, hidden_states):
 
 
 class Upsample1d(nn.Layer):
-    def __init__(self, kernel="linear", pad_mode="reflect"):
+    def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"):
         super().__init__()
         self.pad_mode = pad_mode
         kernel_1d = paddle.to_tensor(_kernels[kernel])
@@ -314,7 +314,7 @@ def forward(self, hidden_states, temb=None):
 
 
 class SelfAttention1d(nn.Layer):
-    def __init__(self, in_channels, n_head=1, dropout_rate=0.0):
+    def __init__(self, in_channels: int, n_head: int = 1, dropout_rate: float = 0.0):
         super().__init__()
         self.channels = in_channels
         self.group_norm = nn.GroupNorm(1, num_channels=in_channels)
@@ -333,6 +333,12 @@ def __init__(self, in_channels, n_head=1, dropout_rate=0.0):
         self._use_memory_efficient_attention_xformers = False
         self._attention_op = None
 
+    def transpose_for_scores(self, projection: paddle.Tensor) -> paddle.Tensor:
+        new_projection_shape = projection.shape[:-1] + (self.num_heads, -1)
+        # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
+        new_projection = projection.reshape(new_projection_shape).transpose([0, 2, 1, 3])
+        return new_projection
+
     def reshape_heads_to_batch_dim(self, tensor, transpose=True):
         tensor = tensor.reshape([0, 0, self.num_heads, self.head_size])
         if transpose:
@@ -372,6 +378,7 @@ def set_use_memory_efficient_attention_xformers(
 
     def forward(self, hidden_states):
         residual = hidden_states
+        # batch, channel_dim, seq = hidden_states.shape
 
         hidden_states = self.group_norm(hidden_states)
         hidden_states = hidden_states.transpose([0, 2, 1])
@@ -603,7 +610,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
 
 
 class UpBlock1D(nn.Layer):
-    def __init__(self, in_channels, out_channels, mid_channels=None):
+    def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[int] = None):
         super().__init__()
         mid_channels = in_channels if mid_channels is None else mid_channels
 
@@ -651,7 +658,20 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
         return hidden_states
 
 
-def get_down_block(down_block_type, num_layers, in_channels, out_channels, temb_channels, add_downsample):
+DownBlockType = Union[DownResnetBlock1D, DownBlock1D, AttnDownBlock1D, DownBlock1DNoSkip]
+MidBlockType = Union[MidResTemporalBlock1D, ValueFunctionMidBlock1D, UNetMidBlock1D]
+OutBlockType = Union[OutConv1DBlock, OutValueFunctionBlock]
+UpBlockType = Union[UpResnetBlock1D, UpBlock1D, AttnUpBlock1D, UpBlock1DNoSkip]
+
+
+def get_down_block(
+    down_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    temb_channels: int,
+    add_downsample: bool,
+) -> DownBlockType:
     if down_block_type == "DownResnetBlock1D":
         return DownResnetBlock1D(
             in_channels=in_channels,
@@ -669,7 +689,9 @@ def get_down_block(down_block_type, num_layers, in_channels, out_channels, temb_
     raise ValueError(f"{down_block_type} does not exist.")
 
 
-def get_up_block(up_block_type, num_layers, in_channels, out_channels, temb_channels, add_upsample):
+def get_up_block(
+    up_block_type: str, num_layers: int, in_channels: int, out_channels: int, temb_channels: int, add_upsample: bool
+) -> UpBlockType:
     if up_block_type == "UpResnetBlock1D":
         return UpResnetBlock1D(
             in_channels=in_channels,
@@ -687,7 +709,15 @@ def get_up_block(up_block_type, num_layers, in_channels, out_channels, temb_chan
     raise ValueError(f"{up_block_type} does not exist.")
 
 
-def get_mid_block(mid_block_type, num_layers, in_channels, mid_channels, out_channels, embed_dim, add_downsample):
+def get_mid_block(
+    mid_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    mid_channels: int,
+    out_channels: int,
+    embed_dim: int,
+    add_downsample: bool,
+) -> MidBlockType:
     if mid_block_type == "MidResTemporalBlock1D":
         return MidResTemporalBlock1D(
             num_layers=num_layers,
@@ -703,7 +733,9 @@ def get_mid_block(mid_block_type, num_layers, in_channels, mid_channels, out_cha
     raise ValueError(f"{mid_block_type} does not exist.")
 
 
-def get_out_block(*, out_block_type, num_groups_out, embed_dim, out_channels, act_fn, fc_dim):
+def get_out_block(
+    *, out_block_type: str, num_groups_out: int, embed_dim: int, out_channels: int, act_fn: str, fc_dim: int
+) -> Optional[OutBlockType]:
     if out_block_type == "OutConv1DBlock":
         return OutConv1DBlock(num_groups_out, out_channels, embed_dim, act_fn)
     elif out_block_type == "ValueFunction":
diff --git a/ppdiffusers/ppdiffusers/models/unet_2d.py b/ppdiffusers/ppdiffusers/models/unet_2d.py
index b7eb75f02..6be3582e8 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d.py
@@ -72,9 +72,14 @@ class UNet2DModel(ModelMixin, ConfigMixin):
             The downsample type for downsampling layers. Choose between "conv" and "resnet"
         upsample_type (`str`, *optional*, defaults to `conv`):
             The upsample type for upsampling layers. Choose between "conv" and "resnet"
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
         act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
         attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension.
         norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization.
+        attn_norm_num_groups (`int`, *optional*, defaults to `None`):
+            If set to an integer, a group norm layer will be created in the mid block's [`Attention`] layer with the
+            given number of groups. If left as `None`, the group norm layer will only be created if
+            `resnet_time_scale_shift` is set to `default`, and if created will have `norm_num_groups` groups.
         norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for normalization.
         resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
             for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
@@ -104,14 +109,17 @@ def __init__(
         downsample_padding: int = 1,
         downsample_type: str = "conv",
         upsample_type: str = "conv",
+        dropout: float = 0.0,
         act_fn: str = "silu",
         attention_head_dim: Optional[int] = 8,
         norm_num_groups: int = 32,
+        attn_norm_num_groups: Optional[int] = None,
         norm_eps: float = 1e-5,
         resnet_time_scale_shift: str = "default",
         add_attention: bool = True,
         class_embed_type: Optional[str] = None,
         num_class_embeds: Optional[int] = None,
+        num_train_timesteps: Optional[int] = None,
         resnet_pre_temb_non_linearity: Optional[bool] = False,
     ):
         super().__init__()
@@ -140,6 +148,9 @@ def __init__(
         elif time_embedding_type == "positional":
             self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
             timestep_input_dim = block_out_channels[0]
+        elif time_embedding_type == "learned":
+            self.time_proj = nn.Embedding(num_train_timesteps, block_out_channels[0])
+            timestep_input_dim = block_out_channels[0]
 
         self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
 
@@ -183,6 +194,7 @@ def __init__(
                 downsample_padding=downsample_padding,
                 resnet_time_scale_shift=resnet_time_scale_shift,
                 downsample_type=downsample_type,
+                dropout=dropout,
                 resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
             )
             self.down_blocks.append(down_block)
@@ -191,12 +203,14 @@ def __init__(
         self.mid_block = UNetMidBlock2D(
             in_channels=block_out_channels[-1],
             temb_channels=time_embed_dim,
+            dropout=dropout,
             resnet_eps=norm_eps,
             resnet_act_fn=act_fn,
             output_scale_factor=mid_block_scale_factor,
             resnet_time_scale_shift=resnet_time_scale_shift,
             attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1],
             resnet_groups=norm_num_groups,
+            attn_groups=attn_norm_num_groups,
             add_attention=add_attention,
             resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
         )
@@ -225,6 +239,7 @@ def __init__(
                 attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
                 resnet_time_scale_shift=resnet_time_scale_shift,
                 upsample_type=upsample_type,
+                dropout=dropout,
                 resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
             )
             self.up_blocks.append(up_block)
@@ -304,6 +319,8 @@ def forward(
                 class_labels = class_labels.cast(paddle.int64)
             class_emb = self.class_embedding(class_labels).cast(sample.dtype)
             emb = emb + class_emb
+        elif self.class_embedding is None and class_labels is not None:
+            raise ValueError("class_embedding needs to be initialized in order to use class conditioning")
 
         # 2. pre-process
         skip_sample = sample
@@ -348,7 +365,7 @@ def forward(
             sample += skip_sample
 
         if self.config.time_embedding_type == "fourier":
-            timesteps = timesteps.reshape([sample.shape[0], *([1] * len(sample.shape[1:]))])
+            timesteps = timesteps.reshape((sample.shape[0], *([1] * len(sample.shape[1:]))))
             sample = sample / timesteps
 
         if not return_dict:
diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
index 44bda0312..e4e5e1ce0 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple, Union
 
 import numpy as np
 import paddle
@@ -20,13 +20,15 @@
 from paddle.distributed.fleet.utils import recompute
 
 from ..utils import is_ppxformers_available, logging
-from .attention import AdaGroupNorm
+from ..utils.paddle_utils import apply_freeu
+from .activations import get_activation
 from .attention_processor import (
     Attention,
     AttnAddedKVProcessor,
     AttnAddedKVProcessor2_5,
 )
 from .dual_transformer_2d import DualTransformer2DModel
+from .normalization import AdaGroupNorm
 from .resnet import (
     Downsample2D,
     FirDownsample2D,
@@ -42,29 +44,31 @@
 
 
 def get_down_block(
-    down_block_type,
-    num_layers,
-    in_channels,
-    out_channels,
-    temb_channels,
-    add_downsample,
-    resnet_eps,
-    resnet_act_fn,
-    transformer_layers_per_block=1,
-    num_attention_heads=None,
-    resnet_groups=None,
-    cross_attention_dim=None,
-    downsample_padding=None,
-    dual_cross_attention=False,
-    use_linear_projection=False,
-    only_cross_attention=False,
-    upcast_attention=False,
-    resnet_time_scale_shift="default",
-    resnet_skip_time_act=False,
-    resnet_out_scale_factor=1.0,
-    cross_attention_norm=None,
-    attention_head_dim=None,
-    downsample_type=None,
+    down_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    temb_channels: int,
+    add_downsample: bool,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    downsample_padding: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    downsample_type: Optional[str] = None,
+    dropout: float = 0.0,
     resnet_pre_temb_non_linearity=False,
 ):
     # If attn head dim is not defined, we default it to the number of heads
@@ -81,6 +85,7 @@ def get_down_block(
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
+            dropout=dropout,
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -95,6 +100,7 @@ def get_down_block(
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
+            dropout=dropout,
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -114,6 +120,7 @@ def get_down_block(
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
+            dropout=dropout,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
             resnet_groups=resnet_groups,
@@ -132,6 +139,7 @@ def get_down_block(
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
+            dropout=dropout,
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -144,6 +152,7 @@ def get_down_block(
             only_cross_attention=only_cross_attention,
             upcast_attention=upcast_attention,
             resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_type=attention_type,
             resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
         )
     elif down_block_type == "SimpleCrossAttnDownBlock2D":
@@ -154,6 +163,7 @@ def get_down_block(
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
+            dropout=dropout,
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -173,6 +183,7 @@ def get_down_block(
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
+            dropout=dropout,
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -186,6 +197,7 @@ def get_down_block(
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
+            dropout=dropout,
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -198,6 +210,7 @@ def get_down_block(
             num_layers=num_layers,
             in_channels=in_channels,
             out_channels=out_channels,
+            dropout=dropout,
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -211,6 +224,7 @@ def get_down_block(
             num_layers=num_layers,
             in_channels=in_channels,
             out_channels=out_channels,
+            dropout=dropout,
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -226,6 +240,7 @@ def get_down_block(
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
+            dropout=dropout,
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -237,6 +252,7 @@ def get_down_block(
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
+            dropout=dropout,
             add_downsample=add_downsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -249,29 +265,32 @@ def get_down_block(
 
 
 def get_up_block(
-    up_block_type,
-    num_layers,
-    in_channels,
-    out_channels,
-    prev_output_channel,
-    temb_channels,
-    add_upsample,
-    resnet_eps,
-    resnet_act_fn,
-    transformer_layers_per_block=1,
-    num_attention_heads=None,
-    resnet_groups=None,
-    cross_attention_dim=None,
-    dual_cross_attention=False,
-    use_linear_projection=False,
-    only_cross_attention=False,
-    upcast_attention=False,
-    resnet_time_scale_shift="default",
-    resnet_skip_time_act=False,
-    resnet_out_scale_factor=1.0,
-    cross_attention_norm=None,
-    attention_head_dim=None,
-    upsample_type=None,
+    up_block_type: str,
+    num_layers: int,
+    in_channels: int,
+    out_channels: int,
+    prev_output_channel: int,
+    temb_channels: int,
+    add_upsample: bool,
+    resnet_eps: float,
+    resnet_act_fn: str,
+    resolution_idx: Optional[int] = None,
+    transformer_layers_per_block: int = 1,
+    num_attention_heads: Optional[int] = None,
+    resnet_groups: Optional[int] = None,
+    cross_attention_dim: Optional[int] = None,
+    dual_cross_attention: bool = False,
+    use_linear_projection: bool = False,
+    only_cross_attention: bool = False,
+    upcast_attention: bool = False,
+    resnet_time_scale_shift: str = "default",
+    attention_type: str = "default",
+    resnet_skip_time_act: bool = False,
+    resnet_out_scale_factor: float = 1.0,
+    cross_attention_norm: Optional[str] = None,
+    attention_head_dim: Optional[int] = None,
+    upsample_type: Optional[str] = None,
+    dropout: float = 0.0,
     resnet_pre_temb_non_linearity=False,
 ):
     # If attn head dim is not defined, we default it to the number of heads
@@ -289,6 +308,8 @@ def get_up_block(
             out_channels=out_channels,
             prev_output_channel=prev_output_channel,
             temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -303,6 +324,8 @@ def get_up_block(
             out_channels=out_channels,
             prev_output_channel=prev_output_channel,
             temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -322,6 +345,8 @@ def get_up_block(
             out_channels=out_channels,
             prev_output_channel=prev_output_channel,
             temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -333,6 +358,7 @@ def get_up_block(
             only_cross_attention=only_cross_attention,
             upcast_attention=upcast_attention,
             resnet_time_scale_shift=resnet_time_scale_shift,
+            attention_type=attention_type,
             resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
         )
     elif up_block_type == "SimpleCrossAttnUpBlock2D":
@@ -344,6 +370,8 @@ def get_up_block(
             out_channels=out_channels,
             prev_output_channel=prev_output_channel,
             temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -369,6 +397,8 @@ def get_up_block(
             out_channels=out_channels,
             prev_output_channel=prev_output_channel,
             temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
             resnet_groups=resnet_groups,
@@ -384,6 +414,8 @@ def get_up_block(
             out_channels=out_channels,
             prev_output_channel=prev_output_channel,
             temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -397,6 +429,8 @@ def get_up_block(
             out_channels=out_channels,
             prev_output_channel=prev_output_channel,
             temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -409,6 +443,8 @@ def get_up_block(
             num_layers=num_layers,
             in_channels=in_channels,
             out_channels=out_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -422,6 +458,8 @@ def get_up_block(
             num_layers=num_layers,
             in_channels=in_channels,
             out_channels=out_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -437,6 +475,8 @@ def get_up_block(
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -448,6 +488,8 @@ def get_up_block(
             in_channels=in_channels,
             out_channels=out_channels,
             temb_channels=temb_channels,
+            resolution_idx=resolution_idx,
+            dropout=dropout,
             add_upsample=add_upsample,
             resnet_eps=resnet_eps,
             resnet_act_fn=resnet_act_fn,
@@ -459,7 +501,74 @@ def get_up_block(
     raise ValueError(f"{up_block_type} does not exist.")
 
 
+class AutoencoderTinyBlock(nn.Layer):
+    """
+    Tiny Autoencoder block used in [`AutoencoderTiny`]. It is a mini residual module consisting of plain conv + ReLU
+    blocks.
+
+    Args:
+        in_channels (`int`): The number of input channels.
+        out_channels (`int`): The number of output channels.
+        act_fn (`str`):
+            ` The activation function to use. Supported values are `"swish"`, `"mish"`, `"gelu"`, and `"relu"`.
+
+    Returns:
+        `paddle.Tensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to
+        `out_channels`.
+    """
+
+    def __init__(self, in_channels: int, out_channels: int, act_fn: str):
+        super().__init__()
+        act_fn = get_activation(act_fn)
+        self.conv = nn.Sequential(
+            nn.Conv2D(in_channels, out_channels, kernel_size=3, padding=1),
+            act_fn,
+            nn.Conv2D(out_channels, out_channels, kernel_size=3, padding=1),
+            act_fn,
+            nn.Conv2D(out_channels, out_channels, kernel_size=3, padding=1),
+        )
+        self.skip = (
+            nn.Conv2D(in_channels, out_channels, kernel_size=1, bias_attr=False)
+            if in_channels != out_channels
+            else nn.Identity()
+        )
+        self.fuse = nn.ReLU()
+
+    def forward(self, x):
+        return self.fuse(self.conv(x) + self.skip(x))
+
+
 class UNetMidBlock2D(nn.Layer):
+    """
+    A 2D UNet mid-block [`UNetMidBlock2D`] with multiple residual blocks and optional attention blocks.
+
+    Args:
+        in_channels (`int`): The number of input channels.
+        temb_channels (`int`): The number of temporal embedding channels.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
+        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
+        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
+        resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
+            The type of normalization to apply to the time embeddings. This can help to improve the performance of the
+            model on tasks with long-range temporal dependencies.
+        resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
+        resnet_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use in the group normalization layers of the resnet blocks.
+        attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
+        resnet_pre_norm (`bool`, *optional*, defaults to `True`):
+            Whether to use pre-normalization for the resnet blocks.
+        add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
+        attention_head_dim (`int`, *optional*, defaults to 1):
+            Dimension of a single attention head. The number of attention heads is determined based on this value and
+            the number of input channels.
+        output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.
+
+    Returns:
+        `paddle.Tensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
+        in_channels, height, width)`.
+
+    """
+
     def __init__(
         self,
         in_channels: int,
@@ -470,6 +579,7 @@ def __init__(
         resnet_time_scale_shift: str = "default",  # default, spatial
         resnet_act_fn: str = "swish",
         resnet_groups: int = 32,
+        attn_groups: Optional[int] = None,
         resnet_pre_norm: bool = True,
         add_attention: bool = True,
         attention_head_dim: int = 1,
@@ -480,6 +590,9 @@ def __init__(
         resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
         self.add_attention = add_attention
 
+        if attn_groups is None:
+            attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
+
         # there is always at least one resnet
         resnets = [
             ResnetBlock2D(
@@ -513,7 +626,7 @@ def __init__(
                         dim_head=attention_head_dim,
                         rescale_output_factor=output_scale_factor,
                         eps=resnet_eps,
-                        norm_num_groups=resnet_groups if resnet_time_scale_shift == "default" else None,
+                        norm_num_groups=attn_groups,
                         spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
                         residual_connection=True,
                         bias=True,
@@ -560,7 +673,7 @@ def __init__(
         temb_channels: int,
         dropout: float = 0.0,
         num_layers: int = 1,
-        transformer_layers_per_block: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
         resnet_eps: float = 1e-6,
         resnet_time_scale_shift: str = "default",
         resnet_act_fn: str = "swish",
@@ -572,6 +685,7 @@ def __init__(
         dual_cross_attention: bool = False,
         use_linear_projection: bool = False,
         upcast_attention: bool = False,
+        attention_type: str = "default",
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
@@ -580,6 +694,10 @@ def __init__(
         self.num_attention_heads = num_attention_heads
         resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
 
+        # support for variable transformer layers per block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
         # there is always at least one resnet
         resnets = [
             ResnetBlock2D(
@@ -598,18 +716,19 @@ def __init__(
         ]
         attentions = []
 
-        for _ in range(num_layers):
+        for i in range(num_layers):
             if not dual_cross_attention:
                 attentions.append(
                     Transformer2DModel(
                         num_attention_heads,
                         in_channels // num_attention_heads,
                         in_channels=in_channels,
-                        num_layers=transformer_layers_per_block,
+                        num_layers=transformer_layers_per_block[i],
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
                         use_linear_projection=use_linear_projection,
                         upcast_attention=upcast_attention,
+                        attention_type=attention_type,
                     )
                 )
             else:
@@ -642,6 +761,8 @@ def __init__(
         self.attentions = nn.LayerList(attentions)
         self.resnets = nn.LayerList(resnets)
 
+        self.gradient_checkpointing = False
+
     def forward(
         self,
         hidden_states: paddle.Tensor,
@@ -651,17 +772,39 @@ def forward(
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         encoder_attention_mask: Optional[paddle.Tensor] = None,
     ) -> paddle.Tensor:
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
         hidden_states = self.resnets[0](hidden_states, temb)
         for attn, resnet in zip(self.attentions, self.resnets[1:]):
-            hidden_states = attn(
-                hidden_states,
-                encoder_hidden_states=encoder_hidden_states,
-                cross_attention_kwargs=cross_attention_kwargs,
-                attention_mask=attention_mask,
-                encoder_attention_mask=encoder_attention_mask,
-                return_dict=False,
-            )[0]
-            hidden_states = resnet(hidden_states, temb)
+            if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)[0]  # move [0] when paddlepaddle <= 2.4.1
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = recompute(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                    cross_attention_kwargs,
+                    attention_mask,
+                    encoder_attention_mask,
+                )  # [0]
+                hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
+            else:
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
 
         return hidden_states
 
@@ -681,9 +824,9 @@ def __init__(
         attention_head_dim: int = 1,
         output_scale_factor: float = 1.0,
         cross_attention_dim: int = 1280,
-        skip_time_act=False,
-        only_cross_attention=False,
-        cross_attention_norm=None,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
@@ -760,8 +903,10 @@ def forward(
         attention_mask: Optional[paddle.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         encoder_attention_mask: Optional[paddle.Tensor] = None,
-    ):
+    ) -> paddle.Tensor:
         cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
         if attention_mask is None:
             # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
             mask = None if encoder_hidden_states is None else encoder_attention_mask
@@ -773,7 +918,7 @@ def forward(
             #         mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
             mask = attention_mask
 
-        hidden_states = self.resnets[0](hidden_states, temb)
+        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
         for attn, resnet in zip(self.attentions, self.resnets[1:]):
             # attn
             hidden_states = attn(
@@ -784,7 +929,7 @@ def forward(
             )
 
             # resnet
-            hidden_states = resnet(hidden_states, temb)
+            hidden_states = resnet(hidden_states, temb, scale=lora_scale)
 
         return hidden_states
 
@@ -805,7 +950,7 @@ def __init__(
         attention_head_dim: int = 1,
         output_scale_factor: float = 1.0,
         downsample_padding: int = 1,
-        downsample_type="conv",
+        downsample_type: str = "conv",
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
@@ -883,20 +1028,31 @@ def __init__(
         else:
             self.downsamplers = None
 
-    def forward(self, hidden_states, temb=None, upsample_size=None):
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        temb: Optional[paddle.Tensor] = None,
+        upsample_size: Optional[int] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
+        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
         output_states = ()
 
         for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb)
-            hidden_states = attn(hidden_states)
+            cross_attention_kwargs.update({"scale": lora_scale})
+            hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
             output_states = output_states + (hidden_states,)
 
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
                 if self.downsample_type == "resnet":
-                    hidden_states = downsampler(hidden_states, temb=temb)
+                    hidden_states = downsampler(hidden_states, temb=temb, scale=lora_scale)
                 else:
-                    hidden_states = downsampler(hidden_states)
+                    hidden_states = downsampler(hidden_states, scale=lora_scale)
 
             output_states += (hidden_states,)
 
@@ -911,7 +1067,7 @@ def __init__(
         temb_channels: int,
         dropout: float = 0.0,
         num_layers: int = 1,
-        transformer_layers_per_block: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
         resnet_eps: float = 1e-6,
         resnet_time_scale_shift: str = "default",
         resnet_act_fn: str = "swish",
@@ -926,6 +1082,7 @@ def __init__(
         use_linear_projection: bool = False,
         only_cross_attention: bool = False,
         upcast_attention: bool = False,
+        attention_type: str = "default",
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
@@ -934,6 +1091,8 @@ def __init__(
 
         self.has_cross_attention = True
         self.num_attention_heads = num_attention_heads
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
 
         for i in range(num_layers):
             in_channels = in_channels if i == 0 else out_channels
@@ -958,12 +1117,13 @@ def __init__(
                         num_attention_heads,
                         out_channels // num_attention_heads,
                         in_channels=out_channels,
-                        num_layers=transformer_layers_per_block,
+                        num_layers=transformer_layers_per_block[i],
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
                         use_linear_projection=use_linear_projection,
                         only_cross_attention=only_cross_attention,
                         upcast_attention=upcast_attention,
+                        attention_type=attention_type,
                     )
                 )
             else:
@@ -1001,11 +1161,12 @@ def forward(
         attention_mask: Optional[paddle.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         encoder_attention_mask: Optional[paddle.Tensor] = None,
-        additional_residuals=None,
-    ):
-        # TODO(Patrick, William) - attention mask is not used
+        additional_residuals: Optional[paddle.Tensor] = None,
+    ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
         output_states = ()
 
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
         blocks = list(zip(self.resnets, self.attentions))
 
         for i, (resnet, attn) in enumerate(blocks):
@@ -1057,7 +1218,7 @@ def custom_forward(*inputs):
 
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
+                hidden_states = downsampler(hidden_states, scale=lora_scale)
 
             output_states = output_states + (hidden_states,)
 
@@ -1118,7 +1279,9 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    def forward(self, hidden_states, temb=None):
+    def forward(
+        self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, scale: float = 1.0
+    ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
         output_states = ()
 
         for resnet in self.resnets:
@@ -1132,13 +1295,13 @@ def custom_forward(*inputs):
 
                 hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
             else:
-                hidden_states = resnet(hidden_states, temb)
+                hidden_states = resnet(hidden_states, temb, scale=scale)
 
             output_states = output_states + (hidden_states,)
 
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
+                hidden_states = downsampler(hidden_states, scale=scale)
 
             output_states = output_states + (hidden_states,)
 
@@ -1196,13 +1359,13 @@ def __init__(
         else:
             self.downsamplers = None
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states: paddle.Tensor, scale: float = 1.0) -> paddle.Tensor:
         for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb=None)
+            hidden_states = resnet(hidden_states, temb=None, scale=scale)
 
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
+                hidden_states = downsampler(hidden_states, scale)
 
         return hidden_states
 
@@ -1281,14 +1444,15 @@ def __init__(
         else:
             self.downsamplers = None
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states: paddle.Tensor, scale: float = 1.0) -> paddle.Tensor:
         for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb=None)
-            hidden_states = attn(hidden_states)
+            hidden_states = resnet(hidden_states, temb=None, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
 
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states)
+                hidden_states = downsampler(hidden_states, scale)
 
         return hidden_states
 
@@ -1376,16 +1540,23 @@ def __init__(
             self.downsamplers = None
             self.skip_conv = None
 
-    def forward(self, hidden_states, temb=None, skip_sample=None):
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        temb: Optional[paddle.Tensor] = None,
+        skip_sample: Optional[paddle.Tensor] = None,
+        scale: float = 1.0,
+    ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...], paddle.Tensor]:
         output_states = ()
 
         for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb)
-            hidden_states = attn(hidden_states)
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
             output_states += (hidden_states,)
 
         if self.downsamplers is not None:
-            hidden_states = self.resnet_down(hidden_states, temb)
+            hidden_states = self.resnet_down(hidden_states, temb, scale=scale)
             for downsampler in self.downsamplers:
                 skip_sample = downsampler(skip_sample)
 
@@ -1459,15 +1630,21 @@ def __init__(
             self.downsamplers = None
             self.skip_conv = None
 
-    def forward(self, hidden_states, temb=None, skip_sample=None):
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        temb: Optional[paddle.Tensor] = None,
+        skip_sample: Optional[paddle.Tensor] = None,
+        scale: float = 1.0,
+    ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...], paddle.Tensor]:
         output_states = ()
 
         for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb)
+            hidden_states = resnet(hidden_states, temb, scale)
             output_states += (hidden_states,)
 
         if self.downsamplers is not None:
-            hidden_states = self.resnet_down(hidden_states, temb)
+            hidden_states = self.resnet_down(hidden_states, temb, scale)
             for downsampler in self.downsamplers:
                 skip_sample = downsampler(skip_sample)
 
@@ -1545,7 +1722,9 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    def forward(self, hidden_states, temb=None):
+    def forward(
+        self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, scale: float = 1.0
+    ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
         output_states = ()
 
         for resnet in self.resnets:
@@ -1559,13 +1738,13 @@ def custom_forward(*inputs):
 
                 hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
             else:
-                hidden_states = resnet(hidden_states, temb)
+                hidden_states = resnet(hidden_states, temb, scale)
 
             output_states = output_states + (hidden_states,)
 
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states, temb)
+                hidden_states = downsampler(hidden_states, temb, scale)
 
             output_states = output_states + (hidden_states,)
 
@@ -1589,9 +1768,9 @@ def __init__(
         cross_attention_dim: int = 1280,
         output_scale_factor: float = 1.0,
         add_downsample: bool = True,
-        skip_time_act=False,
-        only_cross_attention=False,
-        cross_attention_norm=None,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
@@ -1678,6 +1857,8 @@ def forward(
         output_states = ()
         cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
 
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
+
         if attention_mask is None:
             # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
             mask = None if encoder_hidden_states is None else encoder_attention_mask
@@ -1710,7 +1891,7 @@ def custom_forward(*inputs):
                     cross_attention_kwargs,
                 )
             else:
-                hidden_states = resnet(hidden_states, temb)
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
 
                 hidden_states = attn(
                     hidden_states,
@@ -1723,7 +1904,7 @@ def custom_forward(*inputs):
 
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
-                hidden_states = downsampler(hidden_states, temb)
+                hidden_states = downsampler(hidden_states, temb, scale=lora_scale)
 
             output_states = output_states + (hidden_states,)
 
@@ -1778,7 +1959,9 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    def forward(self, hidden_states, temb=None):
+    def forward(
+        self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, scale: float = 1.0
+    ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
         output_states = ()
 
         for resnet in self.resnets:
@@ -1792,7 +1975,7 @@ def custom_forward(*inputs):
 
                 hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
             else:
-                hidden_states = resnet(hidden_states, temb)
+                hidden_states = resnet(hidden_states, temb, scale)
 
             output_states += (hidden_states,)
 
@@ -1813,7 +1996,7 @@ def __init__(
         dropout: float = 0.0,
         num_layers: int = 4,
         resnet_group_size: int = 32,
-        add_downsample=True,
+        add_downsample: bool = True,
         attention_head_dim: int = 64,
         add_self_attention: bool = False,
         resnet_eps: float = 1e-5,
@@ -1880,6 +2063,7 @@ def forward(
         encoder_attention_mask: Optional[paddle.Tensor] = None,
     ):
         output_states = ()
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
 
         for resnet, attn in zip(self.resnets, self.attentions):
             if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
@@ -1904,7 +2088,7 @@ def custom_forward(*inputs):
                     encoder_attention_mask,
                 )
             else:
-                hidden_states = resnet(hidden_states, temb)
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
                 hidden_states = attn(
                     hidden_states,
                     encoder_hidden_states=encoder_hidden_states,
@@ -1933,6 +2117,7 @@ def __init__(
         prev_output_channel: int,
         out_channels: int,
         temb_channels: int,
+        resolution_idx: int = None,
         dropout: float = 0.0,
         num_layers: int = 1,
         resnet_eps: float = 1e-6,
@@ -1942,7 +2127,7 @@ def __init__(
         resnet_pre_norm: bool = True,
         attention_head_dim: int = 1,
         output_scale_factor: float = 1.0,
-        upsample_type="conv",
+        upsample_type: str = "conv",
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
@@ -2017,22 +2202,32 @@ def __init__(
         else:
             self.upsamplers = None
 
-    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
+        temb: Optional[paddle.Tensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> paddle.Tensor:
         for resnet, attn in zip(self.resnets, self.attentions):
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
             hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
 
-            hidden_states = resnet(hidden_states, temb)
-            hidden_states = attn(hidden_states)
+            hidden_states = resnet(hidden_states, temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, **cross_attention_kwargs)
 
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
                 if self.upsample_type == "resnet":
-                    hidden_states = upsampler(hidden_states, temb=temb)
+                    hidden_states = upsampler(hidden_states, temb=temb, scale=scale)
                 else:
-                    hidden_states = upsampler(hidden_states)
+                    hidden_states = upsampler(hidden_states, scale=scale)
 
         return hidden_states
 
@@ -2044,9 +2239,10 @@ def __init__(
         out_channels: int,
         prev_output_channel: int,
         temb_channels: int,
+        resolution_idx: Optional[int] = None,
         dropout: float = 0.0,
         num_layers: int = 1,
-        transformer_layers_per_block: int = 1,
+        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
         resnet_eps: float = 1e-6,
         resnet_time_scale_shift: str = "default",
         resnet_act_fn: str = "swish",
@@ -2060,6 +2256,7 @@ def __init__(
         use_linear_projection: bool = False,
         only_cross_attention: bool = False,
         upcast_attention: bool = False,
+        attention_type: str = "default",
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
@@ -2069,6 +2266,9 @@ def __init__(
         self.has_cross_attention = True
         self.num_attention_heads = num_attention_heads
 
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * num_layers
+
         for i in range(num_layers):
             res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
@@ -2094,12 +2294,13 @@ def __init__(
                         num_attention_heads,
                         out_channels // num_attention_heads,
                         in_channels=out_channels,
-                        num_layers=transformer_layers_per_block,
+                        num_layers=transformer_layers_per_block[i],
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
                         use_linear_projection=use_linear_projection,
                         only_cross_attention=only_cross_attention,
                         upcast_attention=upcast_attention,
+                        attention_type=attention_type,
                     )
                 )
             else:
@@ -2122,6 +2323,7 @@ def __init__(
             self.upsamplers = None
 
         self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
 
     def forward(
         self,
@@ -2133,11 +2335,32 @@ def forward(
         upsample_size: Optional[int] = None,
         attention_mask: Optional[paddle.Tensor] = None,
         encoder_attention_mask: Optional[paddle.Tensor] = None,
-    ):
+    ) -> paddle.Tensor:
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
         for resnet, attn in zip(self.resnets, self.attentions):
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
             hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
 
             if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
@@ -2163,7 +2386,7 @@ def custom_forward(*inputs):
                     encoder_attention_mask,
                 )
             else:
-                hidden_states = resnet(hidden_states, temb)
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
                 hidden_states = attn(
                     hidden_states,
                     encoder_hidden_states=encoder_hidden_states,
@@ -2175,7 +2398,7 @@ def custom_forward(*inputs):
 
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states, upsample_size)
+                hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale)
 
         return hidden_states
 
@@ -2187,6 +2410,7 @@ def __init__(
         prev_output_channel: int,
         out_channels: int,
         temb_channels: int,
+        resolution_idx: Optional[int] = None,
         dropout: float = 0.0,
         num_layers: int = 1,
         resnet_eps: float = 1e-6,
@@ -2229,12 +2453,40 @@ def __init__(
             self.upsamplers = None
 
         self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
+        temb: Optional[paddle.Tensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> paddle.Tensor:
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
 
-    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
         for resnet in self.resnets:
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
             hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
 
             if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
@@ -2261,6 +2513,7 @@ def __init__(
         self,
         in_channels: int,
         out_channels: int,
+        resolution_idx: Optional[int] = None,
         dropout: float = 0.0,
         num_layers: int = 1,
         resnet_eps: float = 1e-6,
@@ -2270,7 +2523,7 @@ def __init__(
         resnet_pre_norm: bool = True,
         output_scale_factor: float = 1.0,
         add_upsample: bool = True,
-        temb_channels=None,
+        temb_channels: Optional[int] = None,
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
@@ -2302,9 +2555,13 @@ def __init__(
         else:
             self.upsamplers = None
 
-    def forward(self, hidden_states, temb=None):
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, scale: float = 1.0
+    ) -> paddle.Tensor:
         for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb=temb)
+            hidden_states = resnet(hidden_states, temb=temb, scale=scale)
 
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
@@ -2318,6 +2575,7 @@ def __init__(
         self,
         in_channels: int,
         out_channels: int,
+        resolution_idx: Optional[int] = None,
         dropout: float = 0.0,
         num_layers: int = 1,
         resnet_eps: float = 1e-6,
@@ -2328,7 +2586,7 @@ def __init__(
         attention_head_dim: int = 1,
         output_scale_factor: float = 1.0,
         add_upsample: bool = True,
-        temb_channels=None,
+        temb_channels: Optional[int] = None,
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
@@ -2383,14 +2641,19 @@ def __init__(
         else:
             self.upsamplers = None
 
-    def forward(self, hidden_states, temb=None):
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, scale: float = 1.0
+    ) -> paddle.Tensor:
         for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb=temb)
-            hidden_states = attn(hidden_states, temb=temb)
+            hidden_states = resnet(hidden_states, temb=temb, scale=scale)
+            cross_attention_kwargs = {"scale": scale}
+            hidden_states = attn(hidden_states, temb=temb, **cross_attention_kwargs)
 
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states)
+                hidden_states = upsampler(hidden_states, scale=scale)
 
         return hidden_states
 
@@ -2402,6 +2665,7 @@ def __init__(
         prev_output_channel: int,
         out_channels: int,
         temb_channels: int,
+        resolution_idx: Optional[int] = None,
         dropout: float = 0.0,
         num_layers: int = 1,
         resnet_eps: float = 1e-6,
@@ -2488,16 +2752,26 @@ def __init__(
             self.skip_norm = None
             self.act = None
 
-    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
+        temb: Optional[paddle.Tensor] = None,
+        skip_sample=None,
+        scale: float = 1.0,
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         for resnet in self.resnets:
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
             hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
 
-            hidden_states = resnet(hidden_states, temb)
+            hidden_states = resnet(hidden_states, temb, scale=scale)
 
-        hidden_states = self.attentions[0](hidden_states)
+        cross_attention_kwargs = {"scale": scale}
+        hidden_states = self.attentions[0](hidden_states, **cross_attention_kwargs)
 
         if skip_sample is not None:
             skip_sample = self.upsampler(skip_sample)
@@ -2511,7 +2785,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample
 
             skip_sample = skip_sample + skip_sample_states
 
-            hidden_states = self.resnet_up(hidden_states, temb)
+            hidden_states = self.resnet_up(hidden_states, temb, scale=scale)
 
         return hidden_states, skip_sample
 
@@ -2523,6 +2797,7 @@ def __init__(
         prev_output_channel: int,
         out_channels: int,
         temb_channels: int,
+        resolution_idx: Optional[int] = None,
         dropout: float = 0.0,
         num_layers: int = 1,
         resnet_eps: float = 1e-6,
@@ -2588,14 +2863,23 @@ def __init__(
             self.skip_norm = None
             self.act = None
 
-    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
+        temb: Optional[paddle.Tensor] = None,
+        skip_sample=None,
+        scale: float = 1.0,
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         for resnet in self.resnets:
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
             hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
 
-            hidden_states = resnet(hidden_states, temb)
+            hidden_states = resnet(hidden_states, temb, scale=scale)
 
         if skip_sample is not None:
             skip_sample = self.upsampler(skip_sample)
@@ -2609,7 +2893,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample
 
             skip_sample = skip_sample + skip_sample_states
 
-            hidden_states = self.resnet_up(hidden_states, temb)
+            hidden_states = self.resnet_up(hidden_states, temb, scale=scale)
 
         return hidden_states, skip_sample
 
@@ -2621,6 +2905,7 @@ def __init__(
         prev_output_channel: int,
         out_channels: int,
         temb_channels: int,
+        resolution_idx: Optional[int] = None,
         dropout: float = 0.0,
         num_layers: int = 1,
         resnet_eps: float = 1e-6,
@@ -2630,7 +2915,7 @@ def __init__(
         resnet_pre_norm: bool = True,
         output_scale_factor: float = 1.0,
         add_upsample: bool = True,
-        skip_time_act=False,
+        skip_time_act: bool = False,
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
@@ -2683,8 +2968,16 @@ def __init__(
             self.upsamplers = None
 
         self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
 
-    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
+        temb: Optional[paddle.Tensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> paddle.Tensor:
         for resnet in self.resnets:
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
@@ -2701,11 +2994,11 @@ def custom_forward(*inputs):
 
                 hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
             else:
-                hidden_states = resnet(hidden_states, temb)
+                hidden_states = resnet(hidden_states, temb, scale=scale)
 
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states, temb)
+                hidden_states = upsampler(hidden_states, temb, scale=scale)
 
         return hidden_states
 
@@ -2717,6 +3010,7 @@ def __init__(
         out_channels: int,
         prev_output_channel: int,
         temb_channels: int,
+        resolution_idx: Optional[int] = None,
         dropout: float = 0.0,
         num_layers: int = 1,
         resnet_eps: float = 1e-6,
@@ -2728,9 +3022,9 @@ def __init__(
         cross_attention_dim: int = 1280,
         output_scale_factor: float = 1.0,
         add_upsample: bool = True,
-        skip_time_act=False,
-        only_cross_attention=False,
-        cross_attention_norm=None,
+        skip_time_act: bool = False,
+        only_cross_attention: bool = False,
+        cross_attention_norm: Optional[str] = None,
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
@@ -2805,6 +3099,7 @@ def __init__(
             self.upsamplers = None
 
         self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
 
     def forward(
         self,
@@ -2816,9 +3111,10 @@ def forward(
         attention_mask: Optional[paddle.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         encoder_attention_mask: Optional[paddle.Tensor] = None,
-    ):
+    ) -> paddle.Tensor:
         cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
 
+        lora_scale = cross_attention_kwargs.get("scale", 1.0)
         if attention_mask is None:
             # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
             mask = None if encoder_hidden_states is None else encoder_attention_mask
@@ -2857,7 +3153,7 @@ def custom_forward(*inputs):
                     cross_attention_kwargs,
                 )
             else:
-                hidden_states = resnet(hidden_states, temb)
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
 
                 hidden_states = attn(
                     hidden_states,
@@ -2868,7 +3164,7 @@ def custom_forward(*inputs):
 
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
-                hidden_states = upsampler(hidden_states, temb)
+                hidden_states = upsampler(hidden_states, temb, scale=lora_scale)
 
         return hidden_states
 
@@ -2879,6 +3175,7 @@ def __init__(
         in_channels: int,
         out_channels: int,
         temb_channels: int,
+        resolution_idx: int,
         dropout: float = 0.0,
         num_layers: int = 5,
         resnet_eps: float = 1e-5,
@@ -2922,8 +3219,16 @@ def __init__(
             self.upsamplers = None
 
         self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
 
-    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
+        temb: Optional[paddle.Tensor] = None,
+        upsample_size: Optional[int] = None,
+        scale: float = 1.0,
+    ) -> paddle.Tensor:
         res_hidden_states_tuple = res_hidden_states_tuple[-1]
         if res_hidden_states_tuple is not None:
             hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1)
@@ -2954,12 +3259,13 @@ def __init__(
         in_channels: int,
         out_channels: int,
         temb_channels: int,
+        resolution_idx: int,
         dropout: float = 0.0,
         num_layers: int = 4,
         resnet_eps: float = 1e-5,
         resnet_act_fn: str = "gelu",
         resnet_group_size: int = 32,
-        attention_head_dim=1,  # attention dim_head
+        attention_head_dim: int = 1,  # attention dim_head
         cross_attention_dim: int = 768,
         add_upsample: bool = True,
         upcast_attention: bool = False,
@@ -3049,6 +3355,7 @@ def forward(
         if res_hidden_states_tuple is not None:
             hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1)
 
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
         for resnet, attn in zip(self.resnets, self.attentions):
             if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
 
@@ -3076,7 +3383,7 @@ def custom_forward(*inputs):
                     encoder_attention_mask,
                 )
             else:
-                hidden_states = resnet(hidden_states, temb)
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
                 hidden_states = attn(
                     hidden_states,
                     encoder_hidden_states=encoder_hidden_states,
@@ -3104,11 +3411,18 @@ class KAttentionBlock(nn.Layer):
         attention_head_dim (`int`): The number of channels in each head.
         dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
         cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-        num_embeds_ada_norm (:
-            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
-        attention_bias (:
-            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Configure if the attention layers should contain a bias parameter.
+        upcast_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to upcast the attention computation to `float32`.
+        temb_channels (`int`, *optional*, defaults to 768):
+            The number of channels in the token embedding.
+        add_self_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to add self-attention to the block.
+        cross_attention_norm (`str`, *optional*, defaults to `None`):
+            The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
+        group_size (`int`, *optional*, defaults to 32):
+            The number of groups to separate the channels into for group normalization.
     """
 
     def __init__(
diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
index 1cada518e..d7da8828e 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
@@ -20,14 +20,21 @@
 
 from ..configuration_utils import ConfigMixin, register_to_config
 from ..loaders import UNet2DConditionLoadersMixin
-from ..utils import NEG_INF, BaseOutput, logging
+from ..utils import NEG_INF, BaseOutput, deprecate, logging
 from .activations import get_activation
-from .attention_processor import AttentionProcessor, AttnProcessor
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
 from .embeddings import (
     GaussianFourierProjection,
     ImageHintTimeEmbedding,
     ImageProjection,
     ImageTimeEmbedding,
+    PositionNet,
     TextImageProjection,
     TextImageTimeEmbedding,
     TextTimeEmbedding,
@@ -36,12 +43,9 @@
 )
 from .modeling_utils import ModelMixin
 from .unet_2d_blocks import (
-    CrossAttnDownBlock2D,
-    CrossAttnUpBlock2D,
-    DownBlock2D,
+    UNetMidBlock2D,
     UNetMidBlock2DCrossAttn,
     UNetMidBlock2DSimpleCrossAttn,
-    UpBlock2D,
     get_down_block,
     get_up_block,
 )
@@ -82,7 +86,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
             The tuple of downsample blocks to use.
         mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
-            Block type for middle of UNet, it can be either `UNetMidBlock2DCrossAttn` or
+            Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
             `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
         up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
             The tuple of upsample blocks to use.
@@ -94,16 +98,22 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
         downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
         mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
         act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
         norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
             If `None`, normalization and activation layers is skipped in post-processing.
         norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
         cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
             The dimension of the cross attention features.
-        transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
+        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
             The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
             [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
             [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+       reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
+            blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
         encoder_hid_dim (`int`, *optional*, defaults to None):
             If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
             dimension to `cross_attention_dim`.
@@ -137,9 +147,9 @@ class conditioning with `class_embed_type` equal to `None`.
             The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
         time_cond_proj_dim (`int`, *optional*, defaults to `None`):
             The dimension of `cond_proj` layer in the timestep embedding.
-        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
-        conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
-        projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
+        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`,
+        *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`,
+        *optional*): The dimension of the `class_labels` input when
             `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
         class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
             embeddings with the class embeddings.
@@ -174,11 +184,13 @@ def __init__(
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
+        dropout: float = 0.0,
         act_fn: str = "silu",
         norm_num_groups: Optional[int] = 32,
         norm_eps: float = 1e-5,
         cross_attention_dim: Union[int, Tuple[int]] = 1280,
-        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
+        reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
         encoder_hid_dim: Optional[int] = None,
         encoder_hid_dim_type: Optional[str] = None,
         attention_head_dim: Union[int, Tuple[int]] = 8,
@@ -201,6 +213,7 @@ def __init__(
         conv_in_kernel: int = 3,
         conv_out_kernel: int = 3,
         projection_class_embeddings_input_dim: Optional[int] = None,
+        attention_type: str = "default",
         class_embeddings_concat: bool = False,
         mid_block_only_cross_attention: Optional[bool] = None,
         cross_attention_norm: Optional[str] = None,
@@ -259,6 +272,10 @@ def __init__(
             raise ValueError(
                 f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
             )
+        if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
+            for layer_number_per_block in transformer_layers_per_block:
+                if isinstance(layer_number_per_block, list):
+                    raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
 
         # input
         conv_in_padding = (conv_in_kernel - 1) // 2
@@ -424,11 +441,6 @@ def __init__(
         else:
             blocks_time_embed_dim = time_embed_dim
 
-        # pre_temb_act_fun opt
-        self.resnet_pre_temb_non_linearity = resnet_pre_temb_non_linearity
-        if resnet_pre_temb_non_linearity:
-            self.down_resnet_temb_nonlinearity = get_activation(act_fn)
-
         # down
         output_channel = block_out_channels[0]
         for i, down_block_type in enumerate(down_block_types):
@@ -455,10 +467,12 @@ def __init__(
                 only_cross_attention=only_cross_attention[i],
                 upcast_attention=upcast_attention,
                 resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
                 resnet_skip_time_act=resnet_skip_time_act,
                 resnet_out_scale_factor=resnet_out_scale_factor,
                 cross_attention_norm=cross_attention_norm,
                 attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
                 resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
             )
             self.down_blocks.append(down_block)
@@ -469,6 +483,7 @@ def __init__(
                 transformer_layers_per_block=transformer_layers_per_block[-1],
                 in_channels=block_out_channels[-1],
                 temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
                 resnet_eps=norm_eps,
                 resnet_act_fn=act_fn,
                 output_scale_factor=mid_block_scale_factor,
@@ -479,12 +494,14 @@ def __init__(
                 dual_cross_attention=dual_cross_attention,
                 use_linear_projection=use_linear_projection,
                 upcast_attention=upcast_attention,
+                attention_type=attention_type,
                 resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
             )
         elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
             self.mid_block = UNetMidBlock2DSimpleCrossAttn(
                 in_channels=block_out_channels[-1],
                 temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
                 resnet_eps=norm_eps,
                 resnet_act_fn=act_fn,
                 output_scale_factor=mid_block_scale_factor,
@@ -497,6 +514,20 @@ def __init__(
                 cross_attention_norm=cross_attention_norm,
                 resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
             )
+        elif mid_block_type == "UNetMidBlock2D":
+            self.mid_block = UNetMidBlock2D(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                num_layers=0,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                add_attention=False,
+                resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
+            )
         elif mid_block_type is None:
             self.mid_block = None
         else:
@@ -510,8 +541,12 @@ def __init__(
         reversed_num_attention_heads = list(reversed(num_attention_heads))
         reversed_layers_per_block = list(reversed(layers_per_block))
         reversed_cross_attention_dim = list(reversed(cross_attention_dim))
-        reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
-        reversed_only_cross_attention = list(reversed(only_cross_attention))
+        reversed_transformer_layers_per_block = (
+            list(reversed(transformer_layers_per_block))
+            if reverse_transformer_layers_per_block is None
+            else reverse_transformer_layers_per_block
+        )
+        only_cross_attention = list(reversed(only_cross_attention))
 
         output_channel = reversed_block_out_channels[0]
         for i, up_block_type in enumerate(up_block_types):
@@ -539,18 +574,21 @@ def __init__(
                 add_upsample=add_upsample,
                 resnet_eps=norm_eps,
                 resnet_act_fn=act_fn,
+                resolution_idx=i,
                 resnet_groups=norm_num_groups,
                 cross_attention_dim=reversed_cross_attention_dim[i],
                 num_attention_heads=reversed_num_attention_heads[i],
                 dual_cross_attention=dual_cross_attention,
                 use_linear_projection=use_linear_projection,
-                only_cross_attention=reversed_only_cross_attention[i],
+                only_cross_attention=only_cross_attention[i],
                 upcast_attention=upcast_attention,
                 resnet_time_scale_shift=resnet_time_scale_shift,
+                attention_type=attention_type,
                 resnet_skip_time_act=resnet_skip_time_act,
                 resnet_out_scale_factor=resnet_out_scale_factor,
                 cross_attention_norm=cross_attention_norm,
                 attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
+                dropout=dropout,
                 resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
             )
             self.up_blocks.append(up_block)
@@ -571,6 +609,18 @@ def __init__(
             block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
         )
 
+        if attention_type in ["gated", "gated-text-image"]:
+            positive_len = 768
+            if isinstance(cross_attention_dim, int):
+                positive_len = cross_attention_dim
+            elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
+                positive_len = cross_attention_dim[0]
+
+            feature_type = "text-only" if attention_type == "gated" else "text-image"
+            self.position_net = PositionNet(
+                positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
+            )
+
     @property
     def attn_processors(self) -> Dict[str, AttentionProcessor]:
         r"""
@@ -582,8 +632,8 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
         processors = {}
 
         def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "set_processor"):
-                processors[f"{name}.processor"] = module.processor
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
 
             for sub_name, child in module.named_children():
                 fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -595,7 +645,9 @@ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[st
 
         return processors
 
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
         r"""
         Sets the attention processor to use to compute attention.
 
@@ -619,9 +671,9 @@ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, Atte
         def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
             if hasattr(module, "set_processor"):
                 if not isinstance(processor, dict):
-                    module.set_processor(processor)
+                    module.set_processor(processor, _remove_lora=_remove_lora)
                 else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
 
             for sub_name, child in module.named_children():
                 fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
@@ -633,7 +685,16 @@ def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.
         """
-        self.set_attn_processor(AttnProcessor())
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor, _remove_lora=True)
 
     def set_attention_slice(self, slice_size):
         r"""
@@ -701,9 +762,41 @@ def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]):
             fn_recursive_set_attention_slice(module, reversed_slice_size)
 
     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
+        if hasattr(module, "gradient_checkpointing"):
             module.gradient_checkpointing = value
 
+    def enable_freeu(self, s1, s2, b1, b2):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+
     def forward(
         self,
         sample: paddle.Tensor,
@@ -716,6 +809,7 @@ def forward(
         added_cond_kwargs: Optional[Dict[str, paddle.Tensor]] = None,
         down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
         mid_block_additional_residual: Optional[paddle.Tensor] = None,
+        down_intrablock_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
         encoder_attention_mask: Optional[paddle.Tensor] = None,
         return_dict: bool = True,
     ) -> Union[UNet2DConditionOutput, Tuple]:
@@ -723,12 +817,32 @@ def forward(
         The [`UNet2DConditionModel`] forward method.
 
         Args:
-            sample (`paddle.Tensor`):
+            sample (`torch.FloatTensor`):
                 The noisy input tensor with the following shape `(batch, channel, height, width)`.
-            timestep (`paddle.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
-            encoder_hidden_states (`paddle.Tensor`):
+            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.FloatTensor`):
                 The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
-            encoder_attention_mask (`paddle.Tensor`):
+            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`torch.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
+            encoder_attention_mask (`torch.Tensor`):
                 A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
                 `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
                 which adds large negative values to the attention scores corresponding to "discard" tokens.
@@ -740,6 +854,13 @@ def forward(
             added_cond_kwargs: (`dict`, *optional*):
                 A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
                 are passed along to the UNet blocks.
+            down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added to UNet long skip connections from down blocks to up blocks for
+                example from ControlNet side model(s)
+            mid_block_additional_residual (`torch.Tensor`, *optional*):
+                additional residual to be added to UNet mid block output, for example from ControlNet side model
+            down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
 
         Returns:
             [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
@@ -915,18 +1036,40 @@ def forward(
         # 2. pre-process
         sample = self.conv_in(sample)
 
+        # 2.5 GLIGEN position net
+        if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+            cross_attention_kwargs = cross_attention_kwargs.copy()
+            gligen_args = cross_attention_kwargs.pop("gligen")
+            cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+
         # 3. down
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
 
         is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
-        is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None
+        # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+        is_adapter = down_intrablock_additional_residuals is not None
+        # maintain backward compatibility for legacy usage, where
+        #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+        #       but can only use one or the other
+        if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+            deprecate(
+                "T2I should not use down_block_additional_residuals",
+                "1.3.0",
+                "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+                       and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
+                       for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
+                standard_warn=False,
+            )
+            down_intrablock_additional_residuals = down_block_additional_residuals
+            is_adapter = True
 
         down_block_res_samples = (sample,)
         for downsample_block in self.down_blocks:
             if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                 # For t2i-adapter CrossAttnDownBlock2D
                 additional_residuals = {}
-                if is_adapter and len(down_block_additional_residuals) > 0:
-                    additional_residuals["additional_residuals"] = down_block_additional_residuals.pop(0)
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
 
                 sample, res_samples = downsample_block(
                     hidden_states=sample,
@@ -938,12 +1081,17 @@ def forward(
                     **additional_residuals,
                 )
             else:
-                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+                # paddle original code:
+                # sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+                # if is_adapter and len(down_block_additional_residuals) > 0:
+                #     sample += down_block_additional_residuals.pop(0)
+                #     # westfish: add to align with torch features
+                #     res_samples = tuple(res_samples[:-1]) + (sample,)
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
+                if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                    sample += down_intrablock_additional_residuals.pop(0)
 
-                if is_adapter and len(down_block_additional_residuals) > 0:
-                    sample += down_block_additional_residuals.pop(0)
-                    # westfish: add to align with torch features
-                    res_samples = tuple(res_samples[:-1]) + (sample,)
             down_block_res_samples += res_samples
 
         if is_controlnet:
@@ -959,14 +1107,25 @@ def forward(
 
         # 4. mid
         if self.mid_block is not None:
-            sample = self.mid_block(
-                sample,
-                emb,
-                encoder_hidden_states=encoder_hidden_states,
-                attention_mask=attention_mask,
-                cross_attention_kwargs=cross_attention_kwargs,
-                encoder_attention_mask=encoder_attention_mask,
-            )
+            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+            else:
+                sample = self.mid_block(sample, emb)
+
+            # To support T2I-Adapter-XL
+            if (
+                is_adapter
+                and len(down_intrablock_additional_residuals) > 0
+                and sample.shape == down_intrablock_additional_residuals[0].shape
+            ):
+                sample += down_intrablock_additional_residuals.pop(0)
 
         if is_controlnet:
             sample = sample + mid_block_additional_residual
@@ -1000,6 +1159,7 @@ def forward(
                     temb=emb,
                     res_hidden_states_tuple=res_samples,
                     upsample_size=upsample_size,
+                    scale=lora_scale,
                 )
 
         # 6. post-process
diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py
index d395de074..36abadec7 100644
--- a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py
+++ b/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py
@@ -13,9 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Any, Dict, Optional, Tuple
+
 import paddle
 import paddle.nn as nn
+from paddle.distributed.fleet.utils import recompute
 
+from ..utils.paddle_utils import apply_freeu
+from .dual_transformer_2d import DualTransformer2DModel
 from .resnet import Downsample2D, ResnetBlock2D, TemporalConvLayer, Upsample2D
 from .transformer_2d import Transformer2DModel
 from .transformer_temporal import TransformerTemporalModel
@@ -39,6 +44,8 @@ def get_down_block(
     only_cross_attention=False,
     upcast_attention=False,
     resnet_time_scale_shift="default",
+    temporal_num_attention_heads=8,
+    temporal_max_seq_length=32,
 ):
     if down_block_type == "DownBlock3D":
         return DownBlock3D(
@@ -74,6 +81,45 @@ def get_down_block(
             upcast_attention=upcast_attention,
             resnet_time_scale_shift=resnet_time_scale_shift,
         )
+    if down_block_type == "DownBlockMotion":
+        return DownBlockMotion(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            temporal_num_attention_heads=temporal_num_attention_heads,
+            temporal_max_seq_length=temporal_max_seq_length,
+        )
+    elif down_block_type == "CrossAttnDownBlockMotion":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockMotion")
+        return CrossAttnDownBlockMotion(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            temporal_num_attention_heads=temporal_num_attention_heads,
+            temporal_max_seq_length=temporal_max_seq_length,
+        )
+
     raise ValueError(f"{down_block_type} does not exist.")
 
 
@@ -88,6 +134,7 @@ def get_up_block(
     resnet_eps,
     resnet_act_fn,
     num_attention_heads,
+    resolution_idx=None,
     resnet_groups=None,
     cross_attention_dim=None,
     dual_cross_attention=False,
@@ -95,6 +142,9 @@ def get_up_block(
     only_cross_attention=False,
     upcast_attention=False,
     resnet_time_scale_shift="default",
+    temporal_num_attention_heads=8,
+    temporal_cross_attention_dim=None,
+    temporal_max_seq_length=32,
 ):
     if up_block_type == "UpBlock3D":
         return UpBlock3D(
@@ -108,6 +158,7 @@ def get_up_block(
             resnet_act_fn=resnet_act_fn,
             resnet_groups=resnet_groups,
             resnet_time_scale_shift=resnet_time_scale_shift,
+            resolution_idx=resolution_idx,
         )
     elif up_block_type == "CrossAttnUpBlock3D":
         if cross_attention_dim is None:
@@ -129,6 +180,47 @@ def get_up_block(
             only_cross_attention=only_cross_attention,
             upcast_attention=upcast_attention,
             resnet_time_scale_shift=resnet_time_scale_shift,
+            resolution_idx=resolution_idx,
+        )
+    if up_block_type == "UpBlockMotion":
+        return UpBlockMotion(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            resolution_idx=resolution_idx,
+            temporal_num_attention_heads=temporal_num_attention_heads,
+            temporal_max_seq_length=temporal_max_seq_length,
+        )
+    elif up_block_type == "CrossAttnUpBlockMotion":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockMotion")
+        return CrossAttnUpBlockMotion(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            resolution_idx=resolution_idx,
+            temporal_num_attention_heads=temporal_num_attention_heads,
+            temporal_max_seq_length=temporal_max_seq_length,
         )
     raise ValueError(f"{up_block_type} does not exist.")
 
@@ -477,17 +569,21 @@ def __init__(
         use_linear_projection=False,
         only_cross_attention=False,
         upcast_attention=False,
+        resolution_idx=None,
     ):
         super().__init__()
         resnets = []
         temp_convs = []
         attentions = []
         temp_attentions = []
+
         self.has_cross_attention = True
         self.num_attention_heads = num_attention_heads
+
         for i in range(num_layers):
             res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
             resnets.append(
                 ResnetBlock2D(
                     in_channels=resnet_in_channels + res_skip_channels,
@@ -542,7 +638,9 @@ def __init__(
             )
         else:
             self.upsamplers = None
+
         self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
 
     def forward(
         self,
@@ -555,6 +653,13 @@ def forward(
         num_frames=1,
         cross_attention_kwargs=None,
     ):
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
         # TODO(Patrick, William) - attention mask is not used
         for resnet, temp_conv, attn, temp_attn in zip(
             self.resnets, self.temp_convs, self.attentions, self.temp_attentions
@@ -562,6 +667,19 @@ def forward(
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
             hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
             hidden_states = resnet(hidden_states, temb)
             hidden_states = temp_conv(hidden_states, num_frames=num_frames)
@@ -599,13 +717,16 @@ def __init__(
         resnet_pre_norm: bool = True,
         output_scale_factor=1.0,
         add_upsample=True,
+        resolution_idx=None,
     ):
         super().__init__()
         resnets = []
         temp_convs = []
+
         for i in range(num_layers):
             res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
             resnets.append(
                 ResnetBlock2D(
                     in_channels=resnet_in_channels + res_skip_channels,
@@ -633,17 +754,794 @@ def __init__(
             self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
         else:
             self.upsamplers = None
+
         self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
 
     def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, num_frames=1):
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
         for resnet, temp_conv in zip(self.resnets, self.temp_convs):
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
             hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)
             hidden_states = resnet(hidden_states, temb)
             hidden_states = temp_conv(hidden_states, num_frames=num_frames)
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
                 hidden_states = upsampler(hidden_states, upsample_size)
+
+        return hidden_states
+
+
+class DownBlockMotion(nn.Layer):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_downsample=True,
+        downsample_padding=1,
+        temporal_num_attention_heads=1,
+        temporal_cross_attention_dim=None,
+        temporal_max_seq_length=32,
+    ):
+        super().__init__()
+        resnets = []
+        motion_modules = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            motion_modules.append(
+                TransformerTemporalModel(
+                    num_attention_heads=temporal_num_attention_heads,
+                    in_channels=out_channels,
+                    norm_num_groups=resnet_groups,
+                    cross_attention_dim=temporal_cross_attention_dim,
+                    attention_bias=False,
+                    activation_fn="geglu",
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=temporal_max_seq_length,
+                    attention_head_dim=out_channels // temporal_num_attention_heads,
+                )
+            )
+
+        self.resnets = nn.LayerList(resnets)
+        self.motion_modules = nn.LayerList(motion_modules)
+
+        if add_downsample:
+            self.downsamplers = nn.LayerList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states, temb=None, scale: float = 1.0, num_frames=1):
+        output_states = ()
+
+        blocks = zip(self.resnets, self.motion_modules)
+        for resnet, motion_module in blocks:
+            if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = recompute(
+                    create_custom_forward(motion_module), hidden_states.requires_grad_(), temb, num_frames
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+                hidden_states = motion_module(hidden_states, num_frames=num_frames)[0]
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale=scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnDownBlockMotion(nn.Layer):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_downsample=True,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+        attention_type="default",
+        temporal_cross_attention_dim=None,
+        temporal_num_attention_heads=8,
+        temporal_max_seq_length=32,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        motion_modules = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+
+            motion_modules.append(
+                TransformerTemporalModel(
+                    num_attention_heads=temporal_num_attention_heads,
+                    in_channels=out_channels,
+                    norm_num_groups=resnet_groups,
+                    cross_attention_dim=temporal_cross_attention_dim,
+                    attention_bias=False,
+                    activation_fn="geglu",
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=temporal_max_seq_length,
+                    attention_head_dim=out_channels // temporal_num_attention_heads,
+                )
+            )
+
+        self.attentions = nn.LayerList(attentions)
+        self.resnets = nn.LayerList(resnets)
+        self.motion_modules = nn.LayerList(motion_modules)
+
+        if add_downsample:
+            self.downsamplers = nn.LayerList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        temb=None,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        num_frames=1,
+        encoder_attention_mask=None,
+        cross_attention_kwargs=None,
+        additional_residuals=None,
+    ):
+        output_states = ()
+
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+
+        blocks = list(zip(self.resnets, self.attentions, self.motion_modules))
+        for i, (resnet, attn, motion_module) in enumerate(blocks):
+            if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)[0]  # move [0] when paddlepaddle <= 2.4.1
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = recompute(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                    cross_attention_kwargs,
+                    attention_mask,
+                    encoder_attention_mask,
+                )  # [0]
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                hidden_states = motion_module(
+                    hidden_states,
+                    num_frames=num_frames,
+                )[0]
+
+            # apply additional residuals to the output of the last pair of resnet and attention blocks
+            if i == len(blocks) - 1 and additional_residuals is not None:
+                hidden_states = hidden_states + additional_residuals
+
+            output_states = output_states + (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states, scale=lora_scale)
+
+            output_states = output_states + (hidden_states,)
+
+        return hidden_states, output_states
+
+
+class CrossAttnUpBlockMotion(nn.Layer):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        resolution_idx: int = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        add_upsample=True,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+        attention_type="default",
+        temporal_cross_attention_dim=None,
+        temporal_num_attention_heads=8,
+        temporal_max_seq_length=32,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        motion_modules = []
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=transformer_layers_per_block,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        out_channels // num_attention_heads,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            motion_modules.append(
+                TransformerTemporalModel(
+                    num_attention_heads=temporal_num_attention_heads,
+                    in_channels=out_channels,
+                    norm_num_groups=resnet_groups,
+                    cross_attention_dim=temporal_cross_attention_dim,
+                    attention_bias=False,
+                    activation_fn="geglu",
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=temporal_max_seq_length,
+                    attention_head_dim=out_channels // temporal_num_attention_heads,
+                )
+            )
+
+        self.attentions = nn.LayerList(attentions)
+        self.resnets = nn.LayerList(resnets)
+        self.motion_modules = nn.LayerList(motion_modules)
+
+        if add_upsample:
+            self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
+        temb: Optional[paddle.Tensor] = None,
+        encoder_hidden_states: Optional[paddle.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[paddle.Tensor] = None,
+        encoder_attention_mask: Optional[paddle.Tensor] = None,
+        num_frames=1,
+    ):
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
+        blocks = zip(self.resnets, self.attentions, self.motion_modules)
+        for resnet, attn, motion_module in blocks:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
+
+            if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)[0]  # move [0] when paddlepaddle <= 2.4.1
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = recompute(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                    cross_attention_kwargs,
+                    attention_mask,
+                    encoder_attention_mask,
+                )  # [0]
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                hidden_states = motion_module(
+                    hidden_states,
+                    num_frames=num_frames,
+                )[0]
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale)
+
+        return hidden_states
+
+
+class UpBlockMotion(nn.Layer):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        resolution_idx: int = None,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+        temporal_norm_num_groups=32,
+        temporal_cross_attention_dim=None,
+        temporal_num_attention_heads=8,
+        temporal_max_seq_length=32,
+    ):
+        super().__init__()
+        resnets = []
+        motion_modules = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+            motion_modules.append(
+                TransformerTemporalModel(
+                    num_attention_heads=temporal_num_attention_heads,
+                    in_channels=out_channels,
+                    norm_num_groups=temporal_norm_num_groups,
+                    cross_attention_dim=temporal_cross_attention_dim,
+                    attention_bias=False,
+                    activation_fn="geglu",
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=temporal_max_seq_length,
+                    attention_head_dim=out_channels // temporal_num_attention_heads,
+                )
+            )
+
+        self.resnets = nn.LayerList(resnets)
+        self.motion_modules = nn.LayerList(motion_modules)
+
+        if add_upsample:
+            self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+        self.resolution_idx = resolution_idx
+
+    def forward(
+        self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, scale: float = 1.0, num_frames=1
+    ):
+        is_freeu_enabled = (
+            getattr(self, "s1", None)
+            and getattr(self, "s2", None)
+            and getattr(self, "b1", None)
+            and getattr(self, "b2", None)
+        )
+
+        blocks = zip(self.resnets, self.motion_modules)
+
+        for resnet, motion_module in blocks:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+
+            # FreeU: Only operate on the first two stages
+            if is_freeu_enabled:
+                hidden_states, res_hidden_states = apply_freeu(
+                    self.resolution_idx,
+                    hidden_states,
+                    res_hidden_states,
+                    s1=self.s1,
+                    s2=self.s2,
+                    b1=self.b1,
+                    b2=self.b2,
+                )
+
+            hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
+
+            if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
+
+            else:
+                hidden_states = resnet(hidden_states, temb, scale=scale)
+                hidden_states = motion_module(hidden_states, num_frames=num_frames)[0]
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size, scale=scale)
+
+        return hidden_states
+
+
+class UNetMidBlockCrossAttnMotion(nn.Layer):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        transformer_layers_per_block: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        num_attention_heads=1,
+        output_scale_factor=1.0,
+        cross_attention_dim=1280,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        upcast_attention=False,
+        attention_type="default",
+        temporal_num_attention_heads=1,
+        temporal_cross_attention_dim=None,
+        temporal_max_seq_length=32,
+    ):
+        super().__init__()
+
+        self.has_cross_attention = True
+        self.num_attention_heads = num_attention_heads
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+        motion_modules = []
+
+        for _ in range(num_layers):
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=transformer_layers_per_block,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        attention_type=attention_type,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        num_attention_heads,
+                        in_channels // num_attention_heads,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            motion_modules.append(
+                TransformerTemporalModel(
+                    num_attention_heads=temporal_num_attention_heads,
+                    attention_head_dim=in_channels // temporal_num_attention_heads,
+                    in_channels=in_channels,
+                    norm_num_groups=resnet_groups,
+                    cross_attention_dim=temporal_cross_attention_dim,
+                    attention_bias=False,
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=temporal_max_seq_length,
+                    activation_fn="geglu",
+                )
+            )
+
+        self.attentions = nn.LayerList(attentions)
+        self.resnets = nn.LayerList(resnets)
+        self.motion_modules = nn.LayerList(motion_modules)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        temb: Optional[paddle.Tensor] = None,
+        encoder_hidden_states: Optional[paddle.Tensor] = None,
+        attention_mask: Optional[paddle.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[paddle.Tensor] = None,
+        num_frames=1,
+    ) -> paddle.Tensor:
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+        hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+
+        blocks = zip(self.attentions, self.resnets[1:], self.motion_modules)
+        for attn, resnet, motion_module in blocks:
+            if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = recompute(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                    cross_attention_kwargs,
+                    attention_mask,
+                    encoder_attention_mask,
+                )  # [0]
+                hidden_states = recompute(create_custom_forward(motion_module), hidden_states, temb)
+                hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
+            else:
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                )[0]
+                hidden_states = motion_module(
+                    hidden_states,
+                    num_frames=num_frames,
+                )[0]
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+
         return hidden_states
diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py b/ppdiffusers/ppdiffusers/models/unet_3d_condition.py
index 1224a5c29..0ad7c5875 100644
--- a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py
+++ b/ppdiffusers/ppdiffusers/models/unet_3d_condition.py
@@ -22,7 +22,13 @@
 from ..configuration_utils import ConfigMixin, register_to_config
 from ..loaders import UNet2DConditionLoadersMixin
 from ..utils import BaseOutput, logging
-from .attention_processor import AttentionProcessor, AttnProcessor
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
 from .embeddings import TimestepEmbedding, Timesteps
 from .modeling_utils import ModelMixin
 from .transformer_temporal import TransformerTemporalModel
@@ -235,9 +241,12 @@ def __init__(
                 cross_attention_dim=cross_attention_dim,
                 num_attention_heads=reversed_num_attention_heads[i],
                 dual_cross_attention=False,
+                resolution_idx=i,
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel
+
+        # out
         if norm_num_groups is not None:
             self.conv_norm_out = nn.GroupNorm(
                 num_channels=block_out_channels[0],
@@ -268,8 +277,8 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
         processors = {}
 
         def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "set_processor"):
-                processors[f"{name}.processor"] = module.processor
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
 
             for sub_name, child in module.named_children():
                 fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -348,7 +357,9 @@ def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]):
             fn_recursive_set_attention_slice(module, reversed_slice_size)
 
     # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
         r"""
         Sets the attention processor to use to compute attention.
 
@@ -372,9 +383,9 @@ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, Atte
         def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
             if hasattr(module, "set_processor"):
                 if not isinstance(processor, dict):
-                    module.set_processor(processor)
+                    module.set_processor(processor, _remove_lora=_remove_lora)
                 else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
 
             for sub_name, child in module.named_children():
                 fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
@@ -428,12 +439,55 @@ def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.
         """
-        self.set_attn_processor(AttnProcessor())
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
 
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
             module.gradient_checkpointing = value
 
+    # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.enable_freeu
+    def enable_freeu(self, s1, s2, b1, b2):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+
+    # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.disable_freeu
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+
     def forward(
         self,
         sample: paddle.Tensor,
@@ -456,6 +510,23 @@ def forward(
             timestep (`paddle.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
             encoder_hidden_states (`paddle.Tensor`):
                 The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            class_labels (`paddle.Tensor`, *optional*, defaults to `None`):
+                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
+            timestep_cond: (`paddle.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`paddle.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            down_block_additional_residuals: (`tuple` of `paddle.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`paddle.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
                 tuple.
@@ -513,6 +584,8 @@ def forward(
         emb = self.time_embedding(t_emb, timestep_cond)
         emb = emb.repeat_interleave(repeats=num_frames, axis=0)
         encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, axis=0)
+
+        # 2. pre-process
         sample = sample.transpose([0, 2, 1, 3, 4]).reshape(
             (sample.shape[0] * num_frames, -1) + tuple(sample.shape[3:])
         )
@@ -523,6 +596,7 @@ def forward(
             cross_attention_kwargs=cross_attention_kwargs,
             return_dict=False,
         )[0]
+
         # 3. down
         down_block_res_samples = (sample,)
         for downsample_block in self.down_blocks:
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index b3d9bf832..1d9d61b3c 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
-import warnings
+
+# import warnings
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import paddle
@@ -24,6 +25,7 @@
 from ...image_processor import VaeImageProcessor
 from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import deprecate, logging, randn_tensor, replace_example_docstring
 from ..pipeline_utils import DiffusionPipeline
@@ -194,6 +196,34 @@ def _encode_prompt(
         prompt_embeds: Optional[paddle.Tensor] = None,
         negative_prompt_embeds: Optional[paddle.Tensor] = None,
         lora_scale: Optional[float] = None,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = paddle.concat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    def encode_prompt(
+        self,
+        prompt,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[paddle.Tensor] = None,
+        negative_prompt_embeds: Optional[paddle.Tensor] = None,
+        lora_scale: Optional[float] = None,
     ):
         """
         Encodes the prompt into text encoder hidden states.
@@ -223,6 +253,10 @@ def _encode_prompt(
         # function of text encoder can correctly access it
         if lora_scale is not None and isinstance(self, LoraLoaderMixin):
             self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+
         if prompt is not None and isinstance(prompt, str):
             batch_size = 1
         elif prompt is not None and isinstance(prompt, list):
@@ -258,7 +292,16 @@ def _encode_prompt(
                 attention_mask = None
             prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
             prompt_embeds = prompt_embeds[0]
-        prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.cast(dtype=prompt_embeds_dtype)
+
         bs_embed, seq_len, _ = prompt_embeds.shape
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1])
@@ -305,8 +348,8 @@ def _encode_prompt(
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
             # to avoid doing two forward passes
-            prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds])
-        return prompt_embeds
+            # prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds])
+        return prompt_embeds, negative_prompt_embeds
 
     def run_safety_checker(self, image, dtype):
         if self.safety_checker is None:
@@ -323,10 +366,9 @@ def run_safety_checker(self, image, dtype):
         return image, has_nsfw_concept
 
     def decode_latents(self, latents):
-        warnings.warn(
-            "The decode_latents method is deprecated and will be removed in a future version. Please use VaeImageProcessor instead",
-            FutureWarning,
-        )
+        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
+        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
+
         latents = 1 / self.vae.config.scaling_factor * latents
         image = self.vae.decode(latents, return_dict=False)[0]
         image = (image / 2 + 0.5).clip(min=0, max=1)
@@ -510,10 +552,12 @@ def __call__(
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
         text_encoder_lora_scale = (
             cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
         )
-        prompt_embeds = self._encode_prompt(
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
             prompt,
             num_images_per_prompt,
             do_classifier_free_guidance,
@@ -522,9 +566,14 @@ def __call__(
             negative_prompt_embeds=negative_prompt_embeds,
             lora_scale=text_encoder_lora_scale,
         )
-        self.scheduler.set_timesteps(num_inference_steps)
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
 
         # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
         timesteps = self.scheduler.timesteps
 
         # 5. Prepare latent variables
@@ -541,9 +590,9 @@ def __call__(
 
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
 
         # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 # expand the latents if we are doing classifier free guidance
@@ -571,7 +620,7 @@ def __call__(
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
 
                 # call the callback, if provided
-                if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
                     if callback is not None and i % callback_steps == 0:
                         callback(i, t, latents)
@@ -588,5 +637,6 @@ def __call__(
         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
 
         if not return_dict:
-            return image, has_nsfw_concept
+            return (image, has_nsfw_concept)
+
         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
index 357a83609..aef30a5ce 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -22,9 +22,12 @@
 from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention_processor import (
+    AttnProcessor2_5,
+    LoRAAttnProcessor2_5,
     LoRAXFormersAttnProcessor,
     XFormersAttnProcessor,
 )
+from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     is_invisible_watermark_available,
@@ -69,7 +72,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
     return noise_cfg
 
 
-class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin):
+class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin):
     """
     Pipeline for text-to-image generation using Stable Diffusion XL.
 
@@ -107,6 +110,13 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoad
         scheduler ([`SchedulerMixin`]):
             A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
+            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
+            `stabilityai/stable-diffusion-xl-base-1-0`.
+        add_watermarker (`bool`, *optional*):
+            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
+            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
+            watermarker will be used.
     """
 
     def __init__(
@@ -141,6 +151,39 @@ def __init__(
         else:
             self.watermark = None
 
+    # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+    def enable_vae_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.vae.enable_slicing()
+
+    # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+    def disable_vae_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_slicing()
+
+    # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
+    def enable_vae_tiling(self):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.vae.enable_tiling()
+
+    # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
+    def disable_vae_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+        computing decoding in one step.
+        """
+        self.vae.disable_tiling()
+
     def encode_prompt(
         self,
         prompt: str,
@@ -196,6 +239,11 @@ def encode_prompt(
         # function of text encoder can correctly access it
         if lora_scale is not None and isinstance(self, LoraLoaderMixin):
             self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+            adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+
         if prompt is not None and isinstance(prompt, str):
             batch_size = 1
         elif prompt is not None and isinstance(prompt, list):
@@ -225,14 +273,15 @@ def encode_prompt(
                 )
                 text_input_ids = text_inputs.input_ids
                 untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
-                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
+
                 if (
                     untruncated_ids.shape[-1] >= text_input_ids.shape[-1]
                     and not paddle.equal_all(x=text_input_ids, y=untruncated_ids).item()
                 ):
                     removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
                     logger.warning(
-                        f"The following part of your input was truncated because CLIP can only handle sequences up to {tokenizer.model_max_length} tokens: {removed_text}"
+                        "The following part of your input was truncated because CLIP can only handle sequences up to"
+                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                     )
                 prompt_embeds = text_encoder(text_input_ids, output_hidden_states=True)
 
@@ -253,13 +302,16 @@ def encode_prompt(
             uncond_tokens: List[str]
             if prompt is not None and type(prompt) is not type(negative_prompt):
                 raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} != {type(prompt)}."
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
                 )
             elif isinstance(negative_prompt, str):
                 uncond_tokens = [negative_prompt, negative_prompt_2]
             elif batch_size != len(negative_prompt):
                 raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`: {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches the batch size of `prompt`."
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
                 )
             else:
                 uncond_tokens = [negative_prompt, negative_prompt_2]
@@ -279,8 +331,8 @@ def encode_prompt(
                 negative_prompt_embeds_list.append(negative_prompt_embeds)
             negative_prompt_embeds = paddle.concat(x=negative_prompt_embeds_list, axis=-1)
         prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder_2.dtype)
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
         bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
         prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1])
         prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
         if do_classifier_free_guidance:
@@ -338,15 +390,18 @@ def check_inputs(
             and (not isinstance(callback_steps, int) or callback_steps <= 0)
         ):
             raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}."
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
             )
         if prompt is not None and prompt_embeds is not None:
             raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two."
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
             )
         elif prompt_2 is not None and prompt_embeds is not None:
             raise ValueError(
-                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two."
+                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
             )
         elif prompt is None and prompt_embeds is None:
             raise ValueError(
@@ -358,11 +413,13 @@ def check_inputs(
             raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
         if negative_prompt is not None and negative_prompt_embeds is not None:
             raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
             )
         elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
             raise ValueError(
-                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
+                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
             )
         if prompt_embeds is not None and negative_prompt_embeds is not None:
             if prompt_embeds.shape != negative_prompt_embeds.shape:
@@ -408,14 +465,16 @@ def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, d
     def upcast_vae(self):
         dtype = self.vae.dtype
         self.vae.to(dtype="float32")
-        use_xformers = isinstance(
+        use_2_5_or_xformers = isinstance(
             self.vae.decoder.mid_block.attentions[0].processor,
             (
+                AttnProcessor2_5,
                 XFormersAttnProcessor,
                 LoRAXFormersAttnProcessor,
+                LoRAAttnProcessor2_5,
             ),
         )
-        if use_xformers:
+        if use_2_5_or_xformers:
             self.vae.post_quant_conv.to(dtype=dtype)
             self.vae.decoder.conv_in.to(dtype=dtype)
             self.vae.decoder.mid_block.to(dtype=dtype)
@@ -450,6 +509,9 @@ def __call__(
         original_size: Optional[Tuple[int, int]] = None,
         crops_coords_top_left: Tuple[int, int] = (0, 0),
         target_size: Optional[Tuple[int, int]] = None,
+        negative_original_size: Optional[Tuple[int, int]] = None,
+        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+        negative_target_size: Optional[Tuple[int, int]] = None,
     ):
         """
         Function invoked when calling the pipeline for generation.
@@ -475,7 +537,7 @@ def __call__(
                 scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
                 "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
                 Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
-            guidance_scale (`float`, *optional*, defaults to 7.5):
+            guidance_scale (`float`, *optional*, defaults to 5.0):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -548,6 +610,21 @@ def __call__(
                 For most cases, `target_size` should be set to the desired height and width of the generated image. If
                 not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
                 section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be as same
+                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
 
         Examples:
 
@@ -633,10 +710,20 @@ def __call__(
         add_time_ids = self._get_add_time_ids(
             original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
         )
+        if negative_original_size is not None and negative_target_size is not None:
+            negative_add_time_ids = self._get_add_time_ids(
+                negative_original_size,
+                negative_crops_coords_top_left,
+                negative_target_size,
+                dtype=prompt_embeds.dtype,
+            )
+        else:
+            negative_add_time_ids = add_time_ids
+
         if do_classifier_free_guidance:
             prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds], axis=0)
             add_text_embeds = paddle.concat(x=[negative_pooled_prompt_embeds, add_text_embeds], axis=0)
-            add_time_ids = paddle.concat(x=[add_time_ids, add_time_ids], axis=0)
+            add_time_ids = paddle.concat(x=[negative_add_time_ids, add_time_ids], axis=0)
 
         add_time_ids = add_time_ids.tile(repeat_times=[batch_size * num_images_per_prompt, 1])
 
@@ -682,7 +769,7 @@ def __call__(
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
 
                 # call the callback, if provided
-                if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
                     if callback is not None and i % callback_steps == 0:
                         callback(i, t, latents)
diff --git a/ppdiffusers/ppdiffusers/utils/__init__.py b/ppdiffusers/ppdiffusers/utils/__init__.py
index ec3dda5e2..3bf46fdea 100644
--- a/ppdiffusers/ppdiffusers/utils/__init__.py
+++ b/ppdiffusers/ppdiffusers/utils/__init__.py
@@ -45,6 +45,7 @@
     TO_DIFFUSERS,
     TORCH_SAFETENSORS_WEIGHTS_NAME,
     TORCH_WEIGHTS_NAME,
+    USE_PEFT_BACKEND,
     WEIGHTS_NAME,
     get_map_location_default,
     str2bool,
diff --git a/ppdiffusers/ppdiffusers/utils/constants.py b/ppdiffusers/ppdiffusers/utils/constants.py
index 2e51e9e55..0f292ef03 100644
--- a/ppdiffusers/ppdiffusers/utils/constants.py
+++ b/ppdiffusers/ppdiffusers/utils/constants.py
@@ -82,3 +82,6 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
         print(x.tolist())
         print(y.tolist())
         return raw_all_close(x, y, rtol=rtol, atol=atol, equal_nan=equal_nan, name=name)
+
+
+USE_PEFT_BACKEND = False
diff --git a/ppdiffusers/ppdiffusers/utils/paddle_utils.py b/ppdiffusers/ppdiffusers/utils/paddle_utils.py
index 21856f3e6..06a162025 100644
--- a/ppdiffusers/ppdiffusers/utils/paddle_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/paddle_utils.py
@@ -225,3 +225,63 @@ def no_init_weights(_enable=True):
             yield
         finally:
             _init_weights = old_init_weights
+
+
+def fourier_filter(x_in, threshold, scale):
+    from paddle.fft import fftn, fftshift, ifftn, ifftshift
+
+    """Fourier filter as introduced in FreeU (https://arxiv.org/abs/2309.11497).
+
+    This version of the method comes from here:
+    https://github.com/huggingface/diffusers/pull/5164#issuecomment-1732638706
+    """
+    x = x_in
+    B, C, H, W = x.shape
+
+    # Non-power of 2 images must be float32
+    if (W & (W - 1)) != 0 or (H & (H - 1)) != 0:
+        x = x.to(dtype=paddle.float32)
+
+    # FFT
+    x_freq = fftn(x, dim=(-2, -1))
+    x_freq = fftshift(x_freq, dim=(-2, -1))
+
+    B, C, H, W = x_freq.shape
+    mask = paddle.ones((B, C, H, W), device=x.device)
+
+    crow, ccol = H // 2, W // 2
+    mask[..., crow - threshold : crow + threshold, ccol - threshold : ccol + threshold] = scale
+    x_freq = x_freq * mask
+
+    # IFFT
+    x_freq = ifftshift(x_freq, dim=(-2, -1))
+    x_filtered = ifftn(x_freq, dim=(-2, -1)).real
+
+    return x_filtered.to(dtype=x_in.dtype)
+
+
+def apply_freeu(
+    resolution_idx: int, hidden_states: paddle.Tensor, res_hidden_states: paddle.Tensor, **freeu_kwargs
+) -> Tuple[paddle.Tensor, paddle.Tensor]:
+    """Applies the FreeU mechanism as introduced in https:
+    //arxiv.org/abs/2309.11497. Adapted from the official code repository: https://github.com/ChenyangSi/FreeU.
+
+    Args:
+        resolution_idx (`int`): Integer denoting the UNet block where FreeU is being applied.
+        hidden_states (`paddle.Tensor`): Inputs to the underlying block.
+        res_hidden_states (`paddle.Tensor`): Features from the skip block corresponding to the underlying block.
+        s1 (`float`): Scaling factor for stage 1 to attenuate the contributions of the skip features.
+        s2 (`float`): Scaling factor for stage 2 to attenuate the contributions of the skip features.
+        b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+        b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+    """
+    if resolution_idx == 0:
+        num_half_channels = hidden_states.shape[1] // 2
+        hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * freeu_kwargs["b1"]
+        res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s1"])
+    if resolution_idx == 1:
+        num_half_channels = hidden_states.shape[1] // 2
+        hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * freeu_kwargs["b2"]
+        res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s2"])
+
+    return hidden_states, res_hidden_states
diff --git a/ppdiffusers/tests/lora/test_lora_layers.py b/ppdiffusers/tests/lora/test_lora_layers.py
new file mode 100644
index 000000000..efce4d51e
--- /dev/null
+++ b/ppdiffusers/tests/lora/test_lora_layers.py
@@ -0,0 +1,1853 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import os
+import tempfile
+import time
+import unittest
+
+import numpy as np
+import paddle
+from huggingface_hub.repocard import RepoCard
+from paddlenlp.transformers import (
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+)
+
+from ppdiffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    EulerDiscreteScheduler,
+    StableDiffusionPipeline,
+    StableDiffusionXLPipeline,
+    UNet2DConditionModel,
+    UNet3DConditionModel,
+)
+from ppdiffusers.loaders import (
+    AttnProcsLayers,
+    LoraLoaderMixin,
+    PatchedLoraProjection,
+    text_encoder_attn_modules,
+)
+from ppdiffusers.models.attention_processor import (  # LoRAXFormersAttnProcessor,
+    Attention,
+    AttnProcessor,
+    AttnProcessor2_5,
+    LoRAAttnProcessor,
+    LoRAAttnProcessor2_5,
+    XFormersAttnProcessor,
+)
+from ppdiffusers.utils import floats_tensor, is_ppxformers_available
+from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow
+
+
+def create_lora_layers(model, mock_weights: bool = True):
+    lora_attn_procs = {}
+    for name in model.attn_processors.keys():
+        cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            hidden_size = model.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(model.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = model.config.block_out_channels[block_id]
+        lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+        if mock_weights:
+            with paddle.no_grad():
+                lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1)
+                lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1)
+                lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1)
+                lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1)
+    return lora_attn_procs
+
+
+def create_unet_lora_layers(unet: paddle.nn.Layer):
+    lora_attn_procs = {}
+    for name in unet.attn_processors.keys():
+        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            hidden_size = unet.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = unet.config.block_out_channels[block_id]
+        lora_attn_processor_class = (
+            LoRAAttnProcessor2_5
+            if hasattr(paddle.nn.functional, "scaled_dot_product_attention_")
+            else LoRAAttnProcessor
+        )
+        lora_attn_procs[name] = lora_attn_processor_class(
+            hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
+        )
+    unet_lora_layers = AttnProcsLayers(lora_attn_procs)
+    return lora_attn_procs, unet_lora_layers
+
+
+def create_text_encoder_lora_attn_procs(text_encoder: paddle.nn.Layer):
+    text_lora_attn_procs = {}
+    lora_attn_processor_class = (
+        LoRAAttnProcessor2_5 if hasattr(paddle.nn.functional, "scaled_dot_product_attention_") else LoRAAttnProcessor
+    )
+    for name, module in text_encoder_attn_modules(text_encoder):
+        if isinstance(module.out_proj, paddle.nn.Linear):
+            out_features = module.out_proj.weight.shape[1]
+        elif isinstance(module.out_proj, PatchedLoraProjection):
+            out_features = module.out_proj.regular_linear_layer.weight.shape[1]
+        else:
+            assert False, module.out_proj.__class__
+        text_lora_attn_procs[name] = lora_attn_processor_class(hidden_size=out_features, cross_attention_dim=None)
+    return text_lora_attn_procs
+
+
+def create_text_encoder_lora_layers(text_encoder: paddle.nn.Layer):
+    text_lora_attn_procs = create_text_encoder_lora_attn_procs(text_encoder)
+    text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
+    return text_encoder_lora_layers
+
+
+def create_lora_3d_layers(model, mock_weights: bool = True):
+    lora_attn_procs = {}
+    for name in model.attn_processors.keys():
+        has_cross_attention = name.endswith("attn2.processor") and not (
+            name.startswith("transformer_in") or "temp_attentions" in name.split(".")
+        )
+        cross_attention_dim = model.config.cross_attention_dim if has_cross_attention else None
+        if name.startswith("mid_block"):
+            hidden_size = model.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(model.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = model.config.block_out_channels[block_id]
+        elif name.startswith("transformer_in"):
+            # Note that the `8 * ...` comes from: https://github.com/huggingface/diffusers/blob/7139f0e874f10b2463caa8cbd585762a309d12d6/src/diffusers/models/unet_3d_condition.py#L148
+            hidden_size = 8 * model.config.attention_head_dim
+
+        lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+
+        if mock_weights:
+            # add 1 to weights to mock trained weights
+            with paddle.no_grad():
+                lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1)
+                lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1)
+                lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1)
+                lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1)
+    return lora_attn_procs
+
+
+def set_lora_weights(lora_attn_parameters, randn_weight=False, var=1.0):
+    with paddle.no_grad():
+        for parameter in lora_attn_parameters:
+            if randn_weight:
+                parameter[:] = paddle.randn(shape=parameter.shape, dtype=parameter.dtype) * var
+            else:
+                parameter.zero_()
+
+
+def state_dicts_almost_equal(sd1, sd2):
+    sd1 = dict(sorted(sd1.items()))
+    sd2 = dict(sorted(sd2.items()))
+
+    models_are_equal = True
+    for ten1, ten2 in zip(sd1.values(), sd2.values()):
+        if (ten1 - ten2).abs().max() > 1e-3:
+            models_are_equal = False
+
+    return models_are_equal
+
+
+class LoraLoaderMixinTests(unittest.TestCase):
+    def get_dummy_components(self):
+        paddle.Generator().manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+            steps_offset=1,
+        )
+        paddle.Generator().manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet)
+        text_encoder_lora_layers = create_text_encoder_lora_layers(text_encoder)
+
+        pipeline_components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "safety_checker": None,
+            "feature_extractor": None,
+        }
+        lora_components = {
+            "unet_lora_layers": unet_lora_layers,
+            "text_encoder_lora_layers": text_encoder_lora_layers,
+            "unet_lora_attn_procs": unet_lora_attn_procs,
+        }
+        return pipeline_components, lora_components
+
+    def get_dummy_inputs(self, with_generator=True):
+        batch_size = 1
+        sequence_length = 10
+        num_channels = 4
+        sizes = 32, 32
+
+        generator = paddle.Generator().manual_seed(0)
+        noise = floats_tensor((batch_size, num_channels) + sizes)
+        input_ids = paddle.randint(low=1, high=sequence_length, shape=(batch_size, sequence_length))
+
+        pipeline_inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        if with_generator:
+            pipeline_inputs.update({"generator": generator})
+
+        return noise, input_ids, pipeline_inputs
+
+    def get_dummy_tokens(self):
+        max_seq_length = 77
+        inputs = paddle.randint(low=2, high=56, shape=(1, max_seq_length))
+        prepared_inputs = {}
+        prepared_inputs["input_ids"] = inputs
+        return prepared_inputs
+
+    def create_lora_weight_file(self, tmpdirname):
+        _, lora_components = self.get_dummy_components()
+        LoraLoaderMixin.save_lora_weights(
+            save_directory=tmpdirname,
+            unet_lora_layers=lora_components["unet_lora_layers"],
+            text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+        )
+        self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+
+    def test_lora_save_load(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+        _, _, pipeline_inputs = self.get_dummy_inputs()
+        original_images = sd_pipe(**pipeline_inputs).images
+        orig_image_slice = original_images[0, -3:, -3:, -1]
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            LoraLoaderMixin.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(tmpdirname)
+        lora_images = sd_pipe(**pipeline_inputs).images
+        lora_image_slice = lora_images[0, -3:, -3:, -1]
+
+        # Outputs shouldn't match.
+        self.assertFalse(
+            paddle.allclose(
+                x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
+            ).item()
+        )
+
+    # def test_lora_save_load_safetensors(self):
+    #     pipeline_components, lora_components = self.get_dummy_components()
+    #     sd_pipe = StableDiffusionPipeline(**pipeline_components)
+    #     sd_pipe.set_progress_bar_config(disable=None)
+    #     _, _, pipeline_inputs = self.get_dummy_inputs()
+    #     original_images = sd_pipe(**pipeline_inputs).images
+    #     orig_image_slice = original_images[0, -3:, -3:, -1]
+    #     with tempfile.TemporaryDirectory() as tmpdirname:
+    #         LoraLoaderMixin.save_lora_weights(
+    #             save_directory=tmpdirname,
+    #             unet_lora_layers=lora_components["unet_lora_layers"],
+    #             text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+    #             safe_serialization=True,
+    #         )
+    #         self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+    #         sd_pipe.load_lora_weights(tmpdirname)
+    #     lora_images = sd_pipe(**pipeline_inputs).images
+    #     lora_image_slice = lora_images[0, -3:, -3:, -1]
+    #     # Outputs shouldn't match.
+    #     self.assertFalse(
+    #         paddle.allclose(
+    #             x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
+    #         ).item()
+    #     )
+
+    def test_lora_save_load_legacy(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        unet_lora_attn_procs = lora_components["unet_lora_attn_procs"]
+        sd_pipe = StableDiffusionPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+        _, _, pipeline_inputs = self.get_dummy_inputs()
+        original_images = sd_pipe(**pipeline_inputs).images
+        orig_image_slice = original_images[0, -3:, -3:, -1]
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            unet = sd_pipe.unet
+            unet.set_attn_processor(unet_lora_attn_procs)
+            unet.save_attn_procs(tmpdirname)
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(tmpdirname)
+        lora_images = sd_pipe(**pipeline_inputs).images
+        lora_image_slice = lora_images[0, -3:, -3:, -1]
+
+        # Outputs shouldn't match.
+        self.assertFalse(
+            paddle.allclose(
+                x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
+            ).item()
+        )
+
+    def test_text_encoder_lora_monkey_patch(self):
+        pipeline_components, _ = self.get_dummy_components()
+        pipe = StableDiffusionPipeline(**pipeline_components)
+        dummy_tokens = self.get_dummy_tokens()
+        # inference without lora
+        outputs_without_lora = pipe.text_encoder(**dummy_tokens)[0]
+        assert outputs_without_lora.shape == [1, 77, 32]
+        # monkey patch
+        params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
+        set_lora_weights(params, randn_weight=False)
+        # inference with lora
+        outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
+        assert outputs_with_lora.shape == [1, 77, 32]
+        assert paddle.allclose(
+            x=outputs_without_lora, y=outputs_with_lora
+        ).item(), "lora_up_weight are all zero, so the lora outputs should be the same to without lora outputs"
+        # create lora_attn_procs with randn up.weights
+        create_text_encoder_lora_attn_procs(pipe.text_encoder)
+        # monkey patch
+        params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
+        set_lora_weights(params, randn_weight=True)
+        # inference with lora
+        outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
+        assert outputs_with_lora.shape == [1, 77, 32]
+        assert not paddle.allclose(
+            x=outputs_without_lora, y=outputs_with_lora
+        ).item(), "lora_up_weight are not zero, so the lora outputs should be different to without lora outputs"
+
+    def test_text_encoder_lora_remove_monkey_patch(self):
+        pipeline_components, _ = self.get_dummy_components()
+        pipe = StableDiffusionPipeline(**pipeline_components)
+        dummy_tokens = self.get_dummy_tokens()
+        # inference without lora
+        outputs_without_lora = pipe.text_encoder(**dummy_tokens)[0]
+        assert outputs_without_lora.shape == [1, 77, 32]
+        # monkey patch
+        params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
+        set_lora_weights(params, randn_weight=True)
+        # inference with lora
+        outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
+        assert outputs_with_lora.shape == [1, 77, 32]
+        assert not paddle.allclose(
+            x=outputs_without_lora, y=outputs_with_lora
+        ).item(), "lora outputs should be different to without lora outputs"
+        # remove monkey patch
+        pipe._remove_text_encoder_monkey_patch()
+        # inference with removed lora
+        outputs_without_lora_removed = pipe.text_encoder(**dummy_tokens)[0]
+        assert outputs_without_lora_removed.shape == [1, 77, 32]
+        assert paddle.allclose(
+            x=outputs_without_lora, y=outputs_without_lora_removed
+        ).item(), "remove lora monkey patch should restore the original outputs"
+
+    def test_text_encoder_lora_scale(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+        _, _, pipeline_inputs = self.get_dummy_inputs()
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            LoraLoaderMixin.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(tmpdirname)
+        lora_images = sd_pipe(**pipeline_inputs).images
+        lora_image_slice = lora_images[0, -3:, -3:, -1]
+        lora_images_with_scale = sd_pipe(**pipeline_inputs, cross_attention_kwargs={"scale": 0.5}).images
+        lora_image_with_scale_slice = lora_images_with_scale[0, -3:, -3:, -1]
+        # Outputs shouldn't match.
+        self.assertFalse(
+            paddle.allclose(
+                x=paddle.to_tensor(data=lora_image_slice), y=paddle.to_tensor(data=lora_image_with_scale_slice)
+            ).item()
+        )
+
+    def test_lora_unet_attn_processors(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            self.create_lora_weight_file(tmpdirname)
+            pipeline_components, _ = self.get_dummy_components()
+            sd_pipe = StableDiffusionPipeline(**pipeline_components)
+            sd_pipe.set_progress_bar_config(disable=None)
+            # check if vanilla attention processors are used
+            for _, module in sd_pipe.unet.named_sublayers():
+                if isinstance(module, Attention):
+                    self.assertIsInstance(module.processor, (AttnProcessor, AttnProcessor2_5))
+            # load LoRA weight file
+            sd_pipe.load_lora_weights(tmpdirname)
+            # check if lora attention processors are used
+            for _, module in sd_pipe.unet.named_sublayers():
+                # if isinstance(module, Attention):
+                #     attn_proc_class = (
+                #         LoRAAttnProcessor2_5
+                #         if hasattr(paddle.nn.functional, "scaled_dot_product_attention_")
+                #         else LoRAAttnProcessor
+                #     )
+                #     self.assertIsInstance(module.processor, attn_proc_class)
+                if isinstance(module, Attention):
+                    self.assertIsNotNone(module.to_q.lora_layer)
+                    self.assertIsNotNone(module.to_k.lora_layer)
+                    self.assertIsNotNone(module.to_v.lora_layer)
+                    self.assertIsNotNone(module.to_out[0].lora_layer)
+
+    def test_unload_lora_sd(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
+        sd_pipe = StableDiffusionPipeline(**pipeline_components)
+        original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        orig_image_slice = original_images[0, -3:, -3:, -1]
+        # Emulate training.
+        set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_lora_layers"].parameters(), randn_weight=True)
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            LoraLoaderMixin.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(tmpdirname)
+        lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        lora_image_slice = lora_images[0, -3:, -3:, -1]
+        # Unload LoRA parameters.
+        sd_pipe.unload_lora_weights()
+        original_images_two = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        orig_image_slice_two = original_images_two[0, -3:, -3:, -1]
+        assert not np.allclose(
+            orig_image_slice, lora_image_slice
+        ), "LoRA parameters should lead to a different image slice."
+        assert not np.allclose(
+            orig_image_slice_two, lora_image_slice
+        ), "LoRA parameters should lead to a different image slice."
+        assert np.allclose(
+            orig_image_slice, orig_image_slice_two, atol=5e-3
+        ), "Unloading LoRA parameters should lead to results similar to what was obtained with the pipeline without any LoRA parameters."
+
+    def test_lora_unet_attn_processors_with_xformers(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            self.create_lora_weight_file(tmpdirname)
+            pipeline_components, _ = self.get_dummy_components()
+            sd_pipe = StableDiffusionPipeline(**pipeline_components)
+            sd_pipe.set_progress_bar_config(disable=None)
+            # enable XFormers
+            sd_pipe.enable_xformers_memory_efficient_attention()
+            # check if xFormers attention processors are used
+            for _, module in sd_pipe.unet.named_sublayers():
+                if isinstance(module, Attention):
+                    self.assertIsInstance(module.processor, XFormersAttnProcessor)
+            # load LoRA weight file
+            sd_pipe.load_lora_weights(tmpdirname)
+            # check if lora attention processors are used
+            for _, module in sd_pipe.unet.named_sublayers():
+                # if isinstance(module, Attention):
+                #     self.assertIsInstance(module.processor, LoRAXFormersAttnProcessor)
+                if isinstance(module, Attention):
+                    self.assertIsNotNone(module.to_q.lora_layer)
+                    self.assertIsNotNone(module.to_k.lora_layer)
+                    self.assertIsNotNone(module.to_v.lora_layer)
+                    self.assertIsNotNone(module.to_out[0].lora_layer)
+            # unload lora weights
+            sd_pipe.unload_lora_weights()
+            # check if attention processors are reverted back to xFormers
+            for _, module in sd_pipe.unet.named_sublayers():
+                if isinstance(module, Attention):
+                    self.assertIsInstance(module.processor, XFormersAttnProcessor)
+
+    def test_lora_save_load_with_xformers(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+        _, _, pipeline_inputs = self.get_dummy_inputs()
+        # enable XFormers
+        sd_pipe.enable_xformers_memory_efficient_attention()
+        original_images = sd_pipe(**pipeline_inputs).images
+        orig_image_slice = original_images[0, -3:, -3:, -1]
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            LoraLoaderMixin.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(tmpdirname)
+        lora_images = sd_pipe(**pipeline_inputs).images
+        lora_image_slice = lora_images[0, -3:, -3:, -1]
+        # Outputs shouldn't match.
+        self.assertFalse(
+            paddle.allclose(
+                x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
+            ).item()
+        )
+
+
+class SDXLLoraLoaderMixinTests(unittest.TestCase):
+    def get_dummy_components(self):
+        paddle.Generator().manual_seed(0)
+        unet = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            # SD2-specific config below
+            attention_head_dim=(2, 4),
+            use_linear_projection=True,
+            addition_embed_type="text_time",
+            addition_time_embed_dim=8,
+            transformer_layers_per_block=(1, 2),
+            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
+            cross_attention_dim=64,
+        )
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            steps_offset=1,
+            beta_schedule="scaled_linear",
+            timestep_spacing="leading",
+        )
+        paddle.Generator().manual_seed(0)
+        vae = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        paddle.Generator().manual_seed(0)
+        text_encoder_config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=32,
+        )
+        text_encoder = CLIPTextModel(text_encoder_config)
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet)
+        text_encoder_one_lora_layers = create_text_encoder_lora_layers(text_encoder)
+        text_encoder_two_lora_layers = create_text_encoder_lora_layers(text_encoder_2)
+
+        pipeline_components = {
+            "unet": unet,
+            "scheduler": scheduler,
+            "vae": vae,
+            "text_encoder": text_encoder,
+            "text_encoder_2": text_encoder_2,
+            "tokenizer": tokenizer,
+            "tokenizer_2": tokenizer_2,
+        }
+        lora_components = {
+            "unet_lora_layers": unet_lora_layers,
+            "text_encoder_one_lora_layers": text_encoder_one_lora_layers,
+            "text_encoder_two_lora_layers": text_encoder_two_lora_layers,
+            "unet_lora_attn_procs": unet_lora_attn_procs,
+        }
+        return pipeline_components, lora_components
+
+    def get_dummy_inputs(self, with_generator=True):
+        batch_size = 1
+        sequence_length = 10
+        num_channels = 4
+        sizes = (32, 32)
+
+        generator = paddle.Generator().manual_seed(0)
+        noise = floats_tensor((batch_size, num_channels) + sizes)
+        input_ids = paddle.randint(low=1, high=sequence_length, shape=(batch_size, sequence_length))
+        pipeline_inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "output_type": "np",
+        }
+        if with_generator:
+            pipeline_inputs.update({"generator": generator})
+
+        return noise, input_ids, pipeline_inputs
+
+    def test_lora_save_load(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+        _, _, pipeline_inputs = self.get_dummy_inputs()
+        original_images = sd_pipe(**pipeline_inputs).images
+        orig_image_slice = original_images[0, -3:, -3:, -1]
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(tmpdirname)
+        lora_images = sd_pipe(**pipeline_inputs).images
+        lora_image_slice = lora_images[0, -3:, -3:, -1]
+        # Outputs shouldn't match.
+        self.assertFalse(
+            paddle.allclose(
+                x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
+            ).item()
+        )
+
+    def test_unload_lora_sdxl(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
+        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+        original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        orig_image_slice = original_images[0, -3:, -3:, -1]
+        # Emulate training.
+        set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(tmpdirname)
+        lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        lora_image_slice = lora_images[0, -3:, -3:, -1]
+        # Unload LoRA parameters.
+        sd_pipe.unload_lora_weights()
+        original_images_two = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        orig_image_slice_two = original_images_two[0, -3:, -3:, -1]
+        assert not np.allclose(
+            orig_image_slice, lora_image_slice
+        ), "LoRA parameters should lead to a different image slice."
+        assert not np.allclose(
+            orig_image_slice_two, lora_image_slice
+        ), "LoRA parameters should lead to a different image slice."
+        assert np.allclose(
+            orig_image_slice, orig_image_slice_two, atol=1e-3
+        ), "Unloading LoRA parameters should lead to results similar to what was obtained with the pipeline without any LoRA parameters."
+
+    def test_load_lora_locally(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+                safe_serialization=False,
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))
+
+        sd_pipe.unload_lora_weights()
+
+    # There may be only one item in one layer that needs to replace the name with an empty string.
+    def replace_regular(self, keys):
+        keys_new = []
+        for i in keys:
+            keys_new.append(i.replace("regular_linear_layer.", ""))
+        return keys_new
+
+    def test_text_encoder_lora_state_dict_unchanged(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+
+        text_encoder_1_sd_keys = sorted(sd_pipe.text_encoder.state_dict().keys())
+        text_encoder_2_sd_keys = sorted(sd_pipe.text_encoder_2.state_dict().keys())
+
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+                safe_serialization=False,
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))
+
+            text_encoder_1_sd_keys_2 = sorted(self.replace_regular(sd_pipe.text_encoder.state_dict().keys()))
+            text_encoder_2_sd_keys_2 = sorted(self.replace_regular(sd_pipe.text_encoder_2.state_dict().keys()))
+
+        sd_pipe.unload_lora_weights()
+
+        text_encoder_1_sd_keys_3 = sorted(self.replace_regular(sd_pipe.text_encoder.state_dict().keys()))
+        text_encoder_2_sd_keys_3 = sorted(self.replace_regular(sd_pipe.text_encoder_2.state_dict().keys()))
+
+        # default & unloaded LoRA weights should have identical state_dicts
+        assert text_encoder_1_sd_keys == text_encoder_1_sd_keys_3
+        # default & loaded LoRA weights should NOT have identical state_dicts
+        assert text_encoder_1_sd_keys != text_encoder_1_sd_keys_2
+
+        # default & unloaded LoRA weights should have identical state_dicts
+        assert text_encoder_2_sd_keys == text_encoder_2_sd_keys_3
+        # default & loaded LoRA weights should NOT have identical state_dicts
+        assert text_encoder_2_sd_keys != text_encoder_2_sd_keys_2
+
+    def test_load_lora_locally_safetensors(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+                safe_serialization=True,
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))
+
+        sd_pipe.unload_lora_weights()
+
+    def test_lora_fusion(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
+
+        original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        orig_image_slice = original_images[0, -3:, -3:, -1]
+
+        # Emulate training.
+        set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+                safe_serialization=True,
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))
+
+        sd_pipe.fuse_lora()
+        lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        lora_image_slice = lora_images[0, -3:, -3:, -1]
+
+        self.assertFalse(np.allclose(orig_image_slice, lora_image_slice, atol=1e-3))
+
+    def test_unfuse_lora(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
+
+        original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        orig_image_slice = original_images[0, -3:, -3:, -1]
+
+        # Emulate training.
+        set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+                safe_serialization=True,
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))
+
+        sd_pipe.fuse_lora()
+        lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        lora_image_slice = lora_images[0, -3:, -3:, -1]
+
+        # Reverse LoRA fusion.
+        sd_pipe.unfuse_lora()
+        original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        orig_image_slice_two = original_images[0, -3:, -3:, -1]
+
+        assert not np.allclose(
+            orig_image_slice, lora_image_slice
+        ), "Fusion of LoRAs should lead to a different image slice."
+        assert not np.allclose(
+            orig_image_slice_two, lora_image_slice
+        ), "Fusion of LoRAs should lead to a different image slice."
+        assert np.allclose(
+            orig_image_slice, orig_image_slice_two, atol=5e-3
+        ), "Reversing LoRA fusion should lead to results similar to what was obtained with the pipeline without any LoRA parameters."
+
+    def test_lora_fusion_is_not_affected_by_unloading(self):
+        paddle.seed(0)
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
+
+        _ = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+
+        # Emulate training.
+        set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+                safe_serialization=True,
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))
+
+        sd_pipe.fuse_lora()
+        lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        lora_image_slice = lora_images[0, -3:, -3:, -1]
+
+        # Unload LoRA parameters.
+        sd_pipe.unload_lora_weights()
+        images_with_unloaded_lora = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        images_with_unloaded_lora_slice = images_with_unloaded_lora[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            lora_image_slice, images_with_unloaded_lora_slice, atol=0.05
+        ), "`unload_lora_weights()` should have not effect on the semantics of the results as the LoRA parameters were fused."
+
+    def test_fuse_lora_with_different_scales(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
+
+        _ = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+
+        # Emulate training.
+        set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+                safe_serialization=True,
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))
+
+        sd_pipe.fuse_lora(lora_scale=1.0)
+        lora_images_scale_one = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        lora_image_slice_scale_one = lora_images_scale_one[0, -3:, -3:, -1]
+
+        # Reverse LoRA fusion.
+        sd_pipe.unfuse_lora()
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+                safe_serialization=True,
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))
+
+        sd_pipe.fuse_lora(lora_scale=0.5)
+        lora_images_scale_0_5 = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        lora_image_slice_scale_0_5 = lora_images_scale_0_5[0, -3:, -3:, -1]
+
+        assert not np.allclose(
+            lora_image_slice_scale_one, lora_image_slice_scale_0_5, atol=1e-03
+        ), "Different LoRA scales should influence the outputs accordingly."
+
+    def test_with_different_scales(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
+        original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        original_imagee_slice = original_images[0, -3:, -3:, -1]
+
+        # Emulate training.
+        set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
+        set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+                safe_serialization=True,
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))
+
+        lora_images_scale_one = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        lora_image_slice_scale_one = lora_images_scale_one[0, -3:, -3:, -1]
+
+        lora_images_scale_0_5 = sd_pipe(
+            **pipeline_inputs, generator=paddle.Generator().manual_seed(0), cross_attention_kwargs={"scale": 0.5}
+        ).images
+        lora_image_slice_scale_0_5 = lora_images_scale_0_5[0, -3:, -3:, -1]
+
+        lora_images_scale_0_0 = sd_pipe(
+            **pipeline_inputs, generator=paddle.Generator().manual_seed(0), cross_attention_kwargs={"scale": 0.0}
+        ).images
+        lora_image_slice_scale_0_0 = lora_images_scale_0_0[0, -3:, -3:, -1]
+
+        assert not np.allclose(
+            lora_image_slice_scale_one, lora_image_slice_scale_0_5, atol=1e-03
+        ), "Different LoRA scales should influence the outputs accordingly."
+
+        assert np.allclose(
+            original_imagee_slice, lora_image_slice_scale_0_0, atol=1e-03
+        ), "LoRA scale of 0.0 shouldn't be different from the results without LoRA."
+
+    def test_with_different_scales_fusion_equivalence(self):
+        pipeline_components, lora_components = self.get_dummy_components()
+        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+        # sd_pipe.unet.set_default_attn_processor()
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
+
+        images = sd_pipe(
+            **pipeline_inputs,
+            generator=paddle.Generator().manual_seed(0),
+        ).images
+        images_slice = images[0, -3:, -3:, -1]
+
+        # Emulate training.
+        set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True, var=0.1)
+        set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True, var=0.1)
+        set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True, var=0.1)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            StableDiffusionXLPipeline.save_lora_weights(
+                save_directory=tmpdirname,
+                unet_lora_layers=lora_components["unet_lora_layers"],
+                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+                safe_serialization=True,
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            sd_pipe.load_lora_weights(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))
+
+        lora_images_scale_0_5 = sd_pipe(
+            **pipeline_inputs,
+            generator=paddle.Generator().manual_seed(0),
+            cross_attention_kwargs={"scale": 0.5},
+        ).images
+        lora_image_slice_scale_0_5 = lora_images_scale_0_5[0, -3:, -3:, -1]
+
+        sd_pipe.fuse_lora(lora_scale=0.5)
+        lora_images_scale_0_5_fusion = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        lora_image_slice_scale_0_5_fusion = lora_images_scale_0_5_fusion[0, -3:, -3:, -1]
+
+        assert np.allclose(
+            lora_image_slice_scale_0_5, lora_image_slice_scale_0_5_fusion, atol=5e-03
+        ), "Fusion shouldn't affect the results when calling the pipeline with a non-default LoRA scale."
+
+        sd_pipe.unfuse_lora()
+        images_unfused = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+        images_slice_unfused = images_unfused[0, -3:, -3:, -1]
+
+        assert np.allclose(images_slice, images_slice_unfused, atol=5e-03), "Unfused should match no LoRA"
+
+        assert not np.allclose(
+            images_slice, lora_image_slice_scale_0_5, atol=5e-03
+        ), "0.5 scale and no scale shouldn't match"
+
+
+# @deprecate_after_peft_backend
+class UNet2DConditionLoRAModelTests(unittest.TestCase):
+    model_class = UNet2DConditionModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 4
+        sizes = (32, 32)
+
+        noise = floats_tensor((batch_size, num_channels) + sizes)
+        time_step = paddle.to_tensor([10])
+        encoder_hidden_states = floats_tensor((batch_size, 4, 32))
+
+        return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
+
+    @property
+    def input_shape(self):
+        return (4, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (4, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": (32, 64),
+            "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"),
+            "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"),
+            "cross_attention_dim": 32,
+            "attention_head_dim": 8,
+            "out_channels": 4,
+            "in_channels": 4,
+            "layers_per_block": 2,
+            "sample_size": 32,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_lora_processors(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict["attention_head_dim"] = 8, 16
+        model = self.model_class(**init_dict)
+        with paddle.no_grad():
+            sample1 = model(**inputs_dict).sample
+        lora_attn_procs = create_lora_layers(model)
+        model.set_attn_processor(lora_attn_procs)
+        model.set_attn_processor(model.attn_processors)
+        with paddle.no_grad():
+            sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+            sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+            sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+        assert (sample1 - sample2).abs().max() < 3e-3
+        assert (sample3 - sample4).abs().max() < 3e-3
+        assert (sample2 - sample3).abs().max() > 3e-3
+
+    def test_lora_save_load(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict["attention_head_dim"] = 8, 16
+        paddle.seed(0)
+        model = self.model_class(**init_dict)
+        with paddle.no_grad():
+            old_sample = model(**inputs_dict).sample
+        lora_attn_procs = create_lora_layers(model)
+        model.set_attn_processor(lora_attn_procs)
+        with paddle.no_grad():
+            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_attn_procs(tmpdirname, to_diffusers=False)
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            paddle.seed(0)
+            new_model = self.model_class(**init_dict)
+            new_model.load_attn_procs(tmpdirname, from_diffusers=False)
+
+        with paddle.no_grad():
+            new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+        assert (sample - new_sample).abs().max() < 1e-4
+
+        # LoRA and no LoRA should NOT be the same
+        assert (sample - old_sample).abs().max() > 1e-4
+
+    def test_lora_save_load_safetensors(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["attention_head_dim"] = (8, 16)
+
+        paddle.seed(0)
+        model = self.model_class(**init_dict)
+
+        with paddle.no_grad():
+            old_sample = model(**inputs_dict).sample
+
+        lora_attn_procs = create_lora_layers(model)
+        model.set_attn_processor(lora_attn_procs)
+
+        with paddle.no_grad():
+            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True)
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
+            paddle.seed(0)
+            new_model = self.model_class(**init_dict)
+            new_model.load_attn_procs(tmpdirname, from_diffusers=True, use_safetensors=True)
+        with paddle.no_grad():
+            new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+        assert (sample - new_sample).abs().max() < 0.0001
+        assert (sample - old_sample).abs().max() > 0.0001
+
+    # def test_lora_save_safetensors_load_torch(self):
+    #     # enable deterministic behavior for gradient checkpointing
+    #     init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+    #     init_dict["attention_head_dim"] = (8, 16)
+
+    #     paddle.seed(0)
+    #     model = self.model_class(**init_dict)
+
+    #     lora_attn_procs = create_lora_layers(model, mock_weights=False)
+    #     model.set_attn_processor(lora_attn_procs)
+    #     # Saving as paddle, properly reloads with directly filename
+    #     with tempfile.TemporaryDirectory() as tmpdirname:
+    #         model.save_attn_procs(tmpdirname, to_diffusers=True)
+    #         self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
+    #         paddle.seed(0)
+    #         new_model = self.model_class(**init_dict)
+    #         new_model.load_attn_procs(
+    #             tmpdirname, weight_name="pytorch_lora_weights.bin", from_diffusers=True, use_safetensors=False
+    #         )
+
+    def test_lora_save_torch_force_load_safetensors_error(self):
+        pass
+
+    def test_lora_on_off(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict["attention_head_dim"] = 8, 16
+        paddle.seed(0)
+        model = self.model_class(**init_dict)
+        with paddle.no_grad():
+            old_sample = model(**inputs_dict).sample
+        lora_attn_procs = create_lora_layers(model)
+        model.set_attn_processor(lora_attn_procs)
+        with paddle.no_grad():
+            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+        model.set_default_attn_processor()
+        with paddle.no_grad():
+            new_sample = model(**inputs_dict).sample
+        assert (sample - new_sample).abs().max() < 0.0001
+        assert (sample - old_sample).abs().max() < 3e-3
+
+    @unittest.skipIf(
+        not is_ppxformers_available(),
+        reason="scaled_dot_product_attention attention is only available with CUDA and `scaled_dot_product_attention` installed",
+    )
+    def test_lora_xformers_on_off(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict["attention_head_dim"] = 8, 16
+        paddle.seed(0)
+        model = self.model_class(**init_dict)
+        lora_attn_procs = create_lora_layers(model)
+        model.set_attn_processor(lora_attn_procs)
+        with paddle.no_grad():
+            sample = model(**inputs_dict).sample
+            model.enable_xformers_memory_efficient_attention()
+            on_sample = model(**inputs_dict).sample
+            model.disable_xformers_memory_efficient_attention()
+            off_sample = model(**inputs_dict).sample
+        assert (sample - on_sample).abs().max() < 0.05
+        assert (sample - off_sample).abs().max() < 0.05
+
+
+# @deprecate_after_peft_backend
+class UNet3DConditionModelTests(unittest.TestCase):
+    model_class = UNet3DConditionModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 4
+        num_frames = 4
+        sizes = (32, 32)
+
+        noise = floats_tensor((batch_size, num_channels, num_frames) + sizes)
+        time_step = paddle.to_tensor([10])
+        encoder_hidden_states = floats_tensor((batch_size, 4, 32))
+
+        return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
+
+    @property
+    def input_shape(self):
+        return (4, 4, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (4, 4, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": (32, 64),
+            "down_block_types": (
+                "CrossAttnDownBlock3D",
+                "DownBlock3D",
+            ),
+            "up_block_types": ("UpBlock3D", "CrossAttnUpBlock3D"),
+            "cross_attention_dim": 32,
+            "attention_head_dim": 8,
+            "out_channels": 4,
+            "in_channels": 4,
+            "layers_per_block": 1,
+            "sample_size": 32,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_lora_processors(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["attention_head_dim"] = 8
+
+        model = self.model_class(**init_dict)
+
+        with paddle.no_grad():
+            sample1 = model(**inputs_dict).sample
+
+        lora_attn_procs = create_lora_3d_layers(model)
+
+        # make sure we can set a list of attention processors
+        model.set_attn_processor(lora_attn_procs)
+
+        # test that attn processors can be set to itself
+        model.set_attn_processor(model.attn_processors)
+
+        with paddle.no_grad():
+            sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+            sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+            sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+        assert (sample1 - sample2).abs().max() < 3e-3
+        assert (sample3 - sample4).abs().max() < 3e-3
+
+        # sample 2 and sample 3 should be different
+        assert (sample2 - sample3).abs().max() > 3e-3
+
+    def test_lora_save_load(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["attention_head_dim"] = 8
+
+        paddle.seed(0)
+        model = self.model_class(**init_dict)
+
+        with paddle.no_grad():
+            old_sample = model(**inputs_dict).sample
+
+        lora_attn_procs = create_lora_3d_layers(model)
+        model.set_attn_processor(lora_attn_procs)
+
+        with paddle.no_grad():
+            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_attn_procs(
+                tmpdirname,
+                to_diffusers=False,
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+            paddle.seed(0)
+            new_model = self.model_class(**init_dict)
+            new_model.load_attn_procs(tmpdirname, from_diffusers=False)
+
+        with paddle.no_grad():
+            new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+        assert (sample - new_sample).abs().max() < 1e-3
+
+        # LoRA and no LoRA should NOT be the same
+        assert (sample - old_sample).abs().max() > 1e-4
+
+    def test_lora_save_load_safetensors(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["attention_head_dim"] = 8
+
+        paddle.seed(0)
+        model = self.model_class(**init_dict)
+
+        with paddle.no_grad():
+            old_sample = model(**inputs_dict).sample
+
+        lora_attn_procs = create_lora_3d_layers(model)
+        model.set_attn_processor(lora_attn_procs)
+
+        with paddle.no_grad():
+            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True)
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
+            paddle.seed(0)
+            new_model = self.model_class(**init_dict)
+            new_model.load_attn_procs(tmpdirname, use_safetensors=True, from_diffusers=True)
+
+        with paddle.no_grad():
+            new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+        assert (sample - new_sample).abs().max() < 3e-3
+
+        # LoRA and no LoRA should NOT be the same
+        assert (sample - old_sample).abs().max() > 1e-4
+
+    # def test_lora_save_safetensors_load_torch(self):
+    #     # enable deterministic behavior for gradient checkpointing
+    #     init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+    #     init_dict["attention_head_dim"] = 8
+
+    #     paddle.seed(0)
+    #     model = self.model_class(**init_dict)
+
+    #     lora_attn_procs = create_lora_layers(model, mock_weights=False)
+    #     model.set_attn_processor(lora_attn_procs)
+    #     # Saving as paddle, properly reloads with directly filename
+    #     with tempfile.TemporaryDirectory() as tmpdirname:
+    #         model.save_attn_procs(tmpdirname, to_diffusers=True)
+    #         self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
+    #         paddle.seed(0)
+    #         new_model = self.model_class(**init_dict)
+    #         new_model.load_attn_procs(
+    #             tmpdirname, weight_name="pytorch_lora_weights.bin", use_safetensors=False, from_diffusers=True
+    #         )
+
+    def test_lora_save_paddle_force_load_safetensors_error(self):
+        pass
+
+    def test_lora_on_off(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["attention_head_dim"] = 8
+
+        paddle.seed(0)
+        model = self.model_class(**init_dict)
+
+        with paddle.no_grad():
+            old_sample = model(**inputs_dict).sample
+
+        lora_attn_procs = create_lora_3d_layers(model)
+        model.set_attn_processor(lora_attn_procs)
+
+        with paddle.no_grad():
+            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+
+        model.set_attn_processor(AttnProcessor())
+
+        with paddle.no_grad():
+            new_sample = model(**inputs_dict).sample
+
+        assert (sample - new_sample).abs().max() < 1e-4
+        assert (sample - old_sample).abs().max() < 3e-3
+
+    @unittest.skipIf(
+        not is_ppxformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_lora_xformers_on_off(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["attention_head_dim"] = 4
+
+        paddle.seed(0)
+        model = self.model_class(**init_dict)
+        lora_attn_procs = create_lora_3d_layers(model)
+        model.set_attn_processor(lora_attn_procs)
+
+        # default
+        with paddle.no_grad():
+            sample = model(**inputs_dict).sample
+
+            model.enable_xformers_memory_efficient_attention()
+            on_sample = model(**inputs_dict).sample
+
+            model.disable_xformers_memory_efficient_attention()
+            off_sample = model(**inputs_dict).sample
+
+        assert (sample - on_sample).abs().max() < 0.005
+        assert (sample - off_sample).abs().max() < 0.005
+
+
+@slow
+@require_paddle_gpu
+class LoraIntegrationTests(unittest.TestCase):
+    def test_dreambooth_old_format(self):
+        generator = paddle.Generator().manual_seed().manual_seed(0)
+        lora_model_id = "hf-internal-testing/lora_dreambooth_dog_example"
+        card = RepoCard.load(lora_model_id)
+        base_model_id = card.data.to_dict()["base_model"]
+        pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
+        pipe.load_lora_weights(lora_model_id)
+        images = pipe(
+            "A photo of a sks dog floating in the river", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.7207, 0.6787, 0.601, 0.7478, 0.6838, 0.6064, 0.6984, 0.6443, 0.5785])
+        self.assertTrue(np.allclose(images, expected, atol=0.0001))
+
+    def test_dreambooth_text_encoder_new_format(self):
+        generator = paddle.Generator().manual_seed().manual_seed(0)
+        lora_model_id = "hf-internal-testing/lora-trained"
+        card = RepoCard.load(lora_model_id)
+        base_model_id = card.data.to_dict()["base_model"]
+        pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
+        pipe.load_lora_weights(lora_model_id)
+        images = pipe("A photo of a sks dog", output_type="np", generator=generator, num_inference_steps=2).images
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.6628, 0.6138, 0.539, 0.6625, 0.613, 0.5463, 0.6166, 0.5788, 0.5359])
+        self.assertTrue(np.allclose(images, expected, atol=0.0001))
+
+    def test_a1111(self):
+        generator = paddle.Generator().manual_seed().manual_seed(0)
+        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
+        lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
+        lora_filename = "light_and_shadow.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.3725, 0.3767, 0.3761, 0.3796, 0.3827, 0.3763, 0.3831, 0.3809, 0.3392])
+        self.assertTrue(np.allclose(images, expected, atol=0.0001))
+
+    def test_a1111_with_model_cpu_offload(self):
+        generator = paddle.Generator().manual_seed(0)
+
+        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
+        pipe.enable_model_cpu_offload()
+        lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
+        lora_filename = "light_and_shadow.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292])
+
+        self.assertTrue(np.allclose(images, expected, atol=1e-3))
+
+    def test_a1111_with_sequential_cpu_offload(self):
+        generator = paddle.Generator().manual_seed(0)
+
+        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
+        pipe.enable_sequential_cpu_offload()
+        lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
+        lora_filename = "light_and_shadow.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292])
+
+        self.assertTrue(np.allclose(images, expected, atol=1e-3))
+
+    def test_kohya_sd_v15_with_higher_dimensions(self):
+        generator = paddle.Generator().manual_seed(0)
+
+        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
+        lora_model_id = "hf-internal-testing/urushisato-lora"
+        lora_filename = "urushisato_v15.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.7165, 0.6616, 0.5833, 0.7504, 0.6718, 0.587, 0.6871, 0.6361, 0.5694])
+
+        self.assertTrue(np.allclose(images, expected, atol=1e-3))
+
+    def test_vanilla_funetuning(self):
+        generator = paddle.Generator().manual_seed().manual_seed(0)
+        lora_model_id = "hf-internal-testing/sd-model-finetuned-lora-t4"
+        card = RepoCard.load(lora_model_id)
+        base_model_id = card.data.to_dict()["base_model"]
+        pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
+        pipe.load_lora_weights(lora_model_id)
+        images = pipe("A pokemon with blue eyes.", output_type="np", generator=generator, num_inference_steps=2).images
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.7406, 0.699, 0.5963, 0.7493, 0.7045, 0.6096, 0.6886, 0.6388, 0.583])
+        self.assertTrue(np.allclose(images, expected, atol=0.0001))
+
+    def test_unload_kohya_lora(self):
+        generator = paddle.Generator().manual_seed(0)
+        prompt = "masterpiece, best quality, mountain"
+        num_inference_steps = 2
+        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
+        initial_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        initial_images = initial_images[0, -3:, -3:, -1].flatten()
+        lora_model_id = "hf-internal-testing/civitai-colored-icons-lora"
+        lora_filename = "Colored_Icons_by_vizsumit.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        generator = paddle.Generator().manual_seed(0)
+        lora_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        lora_images = lora_images[0, -3:, -3:, -1].flatten()
+        pipe.unload_lora_weights()
+        generator = paddle.Generator().manual_seed(0)
+        unloaded_lora_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        unloaded_lora_images = unloaded_lora_images[0, -3:, -3:, -1].flatten()
+        self.assertFalse(np.allclose(initial_images, lora_images))
+        self.assertTrue(np.allclose(initial_images, unloaded_lora_images, atol=0.001))
+
+    def test_load_unload_load_kohya_lora(self):
+        # This test ensures that a Kohya-style LoRA can be safely unloaded and then loaded
+        # without introducing any side-effects. Even though the test uses a Kohya-style
+        # LoRA, the underlying adapter handling mechanism is format-agnostic.
+        generator = paddle.Generator().manual_seed(0)
+        prompt = "masterpiece, best quality, mountain"
+        num_inference_steps = 2
+        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
+        initial_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        initial_images = initial_images[0, -3:, -3:, -1].flatten()
+
+        lora_model_id = "hf-internal-testing/civitai-colored-icons-lora"
+        lora_filename = "Colored_Icons_by_vizsumit.safetensors"
+
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        generator = paddle.Generator().manual_seed(0)
+        lora_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        lora_images = lora_images[0, -3:, -3:, -1].flatten()
+
+        pipe.unload_lora_weights()
+        generator = paddle.Generator().manual_seed(0)
+        unloaded_lora_images = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        unloaded_lora_images = unloaded_lora_images[0, -3:, -3:, -1].flatten()
+
+        self.assertFalse(np.allclose(initial_images, lora_images))
+        self.assertTrue(np.allclose(initial_images, unloaded_lora_images, atol=0.001))
+        # make sure we can load a LoRA again after unloading and they don't have
+        # any undesired effects.
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        generator = paddle.Generator().manual_seed(0)
+        lora_images_again = pipe(
+            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+        ).images
+        lora_images_again = lora_images_again[0, -3:, -3:, -1].flatten()
+        self.assertTrue(np.allclose(lora_images, lora_images_again, atol=0.001))
+
+    def test_sdxl_0_9_lora_one(self):
+        generator = paddle.Generator().manual_seed(0)
+
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
+        lora_model_id = "hf-internal-testing/sdxl-0.9-daiton-lora"
+        lora_filename = "daiton-xl-lora-test.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        pipe.enable_model_cpu_offload()
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.3838, 0.3482, 0.3588, 0.3162, 0.319, 0.3369, 0.338, 0.3366, 0.3213])
+
+        self.assertTrue(np.allclose(images, expected, atol=1e-3))
+
+    def test_sdxl_0_9_lora_two(self):
+        generator = paddle.Generator().manual_seed(0)
+
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
+        lora_model_id = "hf-internal-testing/sdxl-0.9-costumes-lora"
+        lora_filename = "saijo.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        pipe.enable_model_cpu_offload()
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.3137, 0.3269, 0.3355, 0.255, 0.2577, 0.2563, 0.2679, 0.2758, 0.2626])
+
+        self.assertTrue(np.allclose(images, expected, atol=1e-3))
+
+    def test_sdxl_0_9_lora_three(self):
+        generator = paddle.Generator().manual_seed(0)
+
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
+        lora_model_id = "hf-internal-testing/sdxl-0.9-kamepan-lora"
+        lora_filename = "kame_sdxl_v2-000020-16rank.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        pipe.enable_model_cpu_offload()
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.4015, 0.3761, 0.3616, 0.3745, 0.3462, 0.3337, 0.3564, 0.3649, 0.3468])
+
+        self.assertTrue(np.allclose(images, expected, atol=5e-3))
+
+    def test_sdxl_1_0_lora(self):
+        generator = paddle.Generator().manual_seed(0)
+
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipe.enable_model_cpu_offload()
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535])
+
+        self.assertTrue(np.allclose(images, expected, atol=1e-4))
+
+    def test_sdxl_1_0_lora_fusion(self):
+        generator = paddle.Generator().manual_seed(0)
+
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        pipe.fuse_lora()
+        pipe.enable_model_cpu_offload()
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        # This way we also test equivalence between LoRA fusion and the non-fusion behaviour.
+        expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535])
+
+        self.assertTrue(np.allclose(images, expected, atol=1e-4))
+
+    def test_sdxl_1_0_lora_unfusion(self):
+        generator = paddle.Generator().manual_seed(0)
+
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        pipe.fuse_lora()
+        pipe.enable_model_cpu_offload()
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+        images_with_fusion = images[0, -3:, -3:, -1].flatten()
+
+        pipe.unfuse_lora()
+        generator = paddle.Generator().manual_seed(0)
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+        images_without_fusion = images[0, -3:, -3:, -1].flatten()
+
+        self.assertFalse(np.allclose(images_with_fusion, images_without_fusion, atol=1e-3))
+
+    def test_sdxl_1_0_lora_unfusion_effectivity(self):
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipe.enable_model_cpu_offload()
+
+        generator = paddle.Generator().manual_seed(0)
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+        original_image_slice = images[0, -3:, -3:, -1].flatten()
+
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        pipe.fuse_lora()
+
+        generator = paddle.Generator().manual_seed(0)
+        _ = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        pipe.unfuse_lora()
+        generator = paddle.Generator().manual_seed(0)
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+        images_without_fusion_slice = images[0, -3:, -3:, -1].flatten()
+
+        self.assertTrue(np.allclose(original_image_slice, images_without_fusion_slice, atol=1e-3))
+
+    def test_sdxl_1_0_lora_fusion_efficiency(self):
+        generator = paddle.Generator().manual_seed(0)
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        pipe.enable_model_cpu_offload()
+
+        start_time = time.time()
+        for _ in range(3):
+            pipe(
+                "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+            ).images
+        end_time = time.time()
+        elapsed_time_non_fusion = end_time - start_time
+
+        del pipe
+
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+        pipe.fuse_lora()
+        pipe.enable_model_cpu_offload()
+
+        start_time = time.time()
+        generator = paddle.Generator().manual_seed(0)
+        for _ in range(3):
+            pipe(
+                "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+            ).images
+        end_time = time.time()
+        elapsed_time_fusion = end_time - start_time
+
+        self.assertTrue(elapsed_time_fusion < elapsed_time_non_fusion)
+
+    def test_sdxl_1_0_last_ben(self):
+        generator = paddle.Generator().manual_seed(0)
+
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipe.enable_model_cpu_offload()
+        lora_model_id = "TheLastBen/Papercut_SDXL"
+        lora_filename = "papercut.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe("papercut.safetensors", output_type="np", generator=generator, num_inference_steps=2).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.5244, 0.4347, 0.4312, 0.4246, 0.4398, 0.4409, 0.4884, 0.4938, 0.4094])
+
+        self.assertTrue(np.allclose(images, expected, atol=1e-3))
+
+    def test_sdxl_1_0_fuse_unfuse_all(self):
+        pipe = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", paddle_dtype=paddle.float16
+        )
+        text_encoder_1_sd = copy.deepcopy(pipe.text_encoder.state_dict())
+        text_encoder_2_sd = copy.deepcopy(pipe.text_encoder_2.state_dict())
+        unet_sd = copy.deepcopy(pipe.unet.state_dict())
+
+        pipe.load_lora_weights(
+            "davizca87/sun-flower", weight_name="snfw3rXL-000004.safetensors", paddle_dtype=paddle.float16
+        )
+        pipe.fuse_lora()
+        pipe.unload_lora_weights()
+        pipe.unfuse_lora()
+
+        assert state_dicts_almost_equal(text_encoder_1_sd, pipe.text_encoder.state_dict())
+        assert state_dicts_almost_equal(text_encoder_2_sd, pipe.text_encoder_2.state_dict())
+        assert state_dicts_almost_equal(unet_sd, pipe.unet.state_dict())
+
+    def test_sdxl_1_0_lora_with_sequential_cpu_offloading(self):
+        generator = paddle.Generator().manual_seed(0)
+
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+        pipe.enable_sequential_cpu_offload()
+        lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
+        lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
+        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
+
+        images = pipe(
+            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+        ).images
+
+        images = images[0, -3:, -3:, -1].flatten()
+        expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535])
+
+        self.assertTrue(np.allclose(images, expected, atol=1e-3))
diff --git a/ppdiffusers/tests/models/test_activations.py b/ppdiffusers/tests/models/test_activations.py
new file mode 100644
index 000000000..a5d9c2412
--- /dev/null
+++ b/ppdiffusers/tests/models/test_activations.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+from paddle import nn
+
+from ppdiffusers.models.activations import get_activation
+
+
+class ActivationsTests(unittest.TestCase):
+    def test_swish(self):
+        act = get_activation("swish")
+
+        self.assertIsInstance(act, nn.Silu)
+
+        self.assertEqual(act(paddle.to_tensor(-100, dtype=paddle.float32)).item(), 0)
+        self.assertNotEqual(act(paddle.to_tensor(-1, dtype=paddle.float32)).item(), 0)
+        self.assertEqual(act(paddle.to_tensor(0, dtype=paddle.float32)).item(), 0)
+        self.assertEqual(act(paddle.to_tensor(20, dtype=paddle.float32)).item(), 20)
+
+    def test_silu(self):
+        act = get_activation("silu")
+
+        self.assertIsInstance(act, nn.Silu)
+
+        self.assertEqual(act(paddle.to_tensor(-100, dtype=paddle.float32)).item(), 0)
+        self.assertNotEqual(act(paddle.to_tensor(-1, dtype=paddle.float32)).item(), 0)
+        self.assertEqual(act(paddle.to_tensor(0, dtype=paddle.float32)).item(), 0)
+        self.assertEqual(act(paddle.to_tensor(20, dtype=paddle.float32)).item(), 20)
+
+    def test_mish(self):
+        act = get_activation("mish")
+
+        self.assertIsInstance(act, nn.Mish)
+
+        self.assertEqual(act(paddle.to_tensor(-200, dtype=paddle.float32)).item(), 0)
+        self.assertNotEqual(act(paddle.to_tensor(-1, dtype=paddle.float32)).item(), 0)
+        self.assertEqual(act(paddle.to_tensor(0, dtype=paddle.float32)).item(), 0)
+        self.assertEqual(act(paddle.to_tensor(20, dtype=paddle.float32)).item(), 20)
+
+    def test_gelu(self):
+        act = get_activation("gelu")
+
+        self.assertIsInstance(act, nn.GELU)
+
+        self.assertEqual(act(paddle.to_tensor(-100, dtype=paddle.float32)).item(), 0)
+        self.assertNotEqual(act(paddle.to_tensor(-1, dtype=paddle.float32)).item(), 0)
+        self.assertEqual(act(paddle.to_tensor(0, dtype=paddle.float32)).item(), 0)
+        self.assertEqual(act(paddle.to_tensor(20, dtype=paddle.float32)).item(), 20)
diff --git a/ppdiffusers/tests/models/test_lora_layers.py b/ppdiffusers/tests/models/test_lora_layers.py
deleted file mode 100644
index c297f80b8..000000000
--- a/ppdiffusers/tests/models/test_lora_layers.py
+++ /dev/null
@@ -1,735 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import tempfile
-import unittest
-
-import numpy as np
-import paddle
-from huggingface_hub.repocard import RepoCard
-from paddlenlp.transformers import (
-    CLIPTextConfig,
-    CLIPTextModel,
-    CLIPTextModelWithProjection,
-    CLIPTokenizer,
-)
-
-from ppdiffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    EulerDiscreteScheduler,
-    StableDiffusionPipeline,
-    StableDiffusionXLPipeline,
-    UNet2DConditionModel,
-)
-from ppdiffusers.loaders import (
-    AttnProcsLayers,
-    LoraLoaderMixin,
-    PatchedLoraProjection,
-    text_encoder_attn_modules,
-)
-from ppdiffusers.models.attention_processor import (
-    Attention,
-    AttnProcessor,
-    AttnProcessor2_5,
-    LoRAAttnProcessor,
-    LoRAAttnProcessor2_5,
-    LoRAXFormersAttnProcessor,
-    XFormersAttnProcessor,
-)
-from ppdiffusers.utils import floats_tensor
-from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow
-
-
-def create_unet_lora_layers(unet: paddle.nn.Layer):
-    lora_attn_procs = {}
-    for name in unet.attn_processors.keys():
-        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
-        if name.startswith("mid_block"):
-            hidden_size = unet.config.block_out_channels[-1]
-        elif name.startswith("up_blocks"):
-            block_id = int(name[len("up_blocks.")])
-            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
-        elif name.startswith("down_blocks"):
-            block_id = int(name[len("down_blocks.")])
-            hidden_size = unet.config.block_out_channels[block_id]
-        lora_attn_processor_class = (
-            LoRAAttnProcessor2_5
-            if hasattr(paddle.nn.functional, "scaled_dot_product_attention_")
-            else LoRAAttnProcessor
-        )
-        lora_attn_procs[name] = lora_attn_processor_class(
-            hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
-        )
-    unet_lora_layers = AttnProcsLayers(lora_attn_procs)
-    return lora_attn_procs, unet_lora_layers
-
-
-def create_text_encoder_lora_attn_procs(text_encoder: paddle.nn.Layer):
-    text_lora_attn_procs = {}
-    lora_attn_processor_class = (
-        LoRAAttnProcessor2_5 if hasattr(paddle.nn.functional, "scaled_dot_product_attention_") else LoRAAttnProcessor
-    )
-    for name, module in text_encoder_attn_modules(text_encoder):
-        if isinstance(module.out_proj, paddle.nn.Linear):
-            out_features = module.out_proj.weight.shape[1]
-        elif isinstance(module.out_proj, PatchedLoraProjection):
-            out_features = module.out_proj.regular_linear_layer.weight.shape[1]
-        else:
-            assert False, module.out_proj.__class__
-        text_lora_attn_procs[name] = lora_attn_processor_class(hidden_size=out_features, cross_attention_dim=None)
-    return text_lora_attn_procs
-
-
-def create_text_encoder_lora_layers(text_encoder: paddle.nn.Layer):
-    text_lora_attn_procs = create_text_encoder_lora_attn_procs(text_encoder)
-    text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
-    return text_encoder_lora_layers
-
-
-def set_lora_weights(lora_attn_parameters, randn_weight=False):
-    with paddle.no_grad():
-        for parameter in lora_attn_parameters:
-            if randn_weight:
-                parameter[:] = paddle.randn(shape=parameter.shape, dtype=parameter.dtype)
-            else:
-                parameter.zero_()
-
-
-class LoraLoaderMixinTests(unittest.TestCase):
-    def get_dummy_components(self):
-        paddle.Generator().manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
-        )
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-            steps_offset=1,
-        )
-        paddle.Generator().manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-        )
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet)
-        text_encoder_lora_layers = create_text_encoder_lora_layers(text_encoder)
-        pipeline_components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "safety_checker": None,
-            "feature_extractor": None,
-        }
-        lora_components = {
-            "unet_lora_layers": unet_lora_layers,
-            "text_encoder_lora_layers": text_encoder_lora_layers,
-            "unet_lora_attn_procs": unet_lora_attn_procs,
-        }
-        return pipeline_components, lora_components
-
-    def get_dummy_inputs(self, with_generator=True):
-        batch_size = 1
-        sequence_length = 10
-        num_channels = 4
-        sizes = 32, 32
-        generator = paddle.Generator().manual_seed(0)
-        noise = floats_tensor((batch_size, num_channels) + sizes)
-        input_ids = paddle.randint(low=1, high=sequence_length, shape=(batch_size, sequence_length))
-        pipeline_inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-            "output_type": "np",
-        }
-        if with_generator:
-            pipeline_inputs.update({"generator": generator})
-        return noise, input_ids, pipeline_inputs
-
-    def get_dummy_tokens(self):
-        max_seq_length = 77
-        inputs = paddle.randint(low=2, high=56, shape=(1, max_seq_length))
-        prepared_inputs = {}
-        prepared_inputs["input_ids"] = inputs
-        return prepared_inputs
-
-    def create_lora_weight_file(self, tmpdirname):
-        _, lora_components = self.get_dummy_components()
-        LoraLoaderMixin.save_lora_weights(
-            save_directory=tmpdirname,
-            unet_lora_layers=lora_components["unet_lora_layers"],
-            text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
-        )
-        self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-
-    def test_lora_save_load(self):
-        pipeline_components, lora_components = self.get_dummy_components()
-        sd_pipe = StableDiffusionPipeline(**pipeline_components)
-        sd_pipe.set_progress_bar_config(disable=None)
-        _, _, pipeline_inputs = self.get_dummy_inputs()
-        original_images = sd_pipe(**pipeline_inputs).images
-        orig_image_slice = original_images[0, -3:, -3:, -1]
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            LoraLoaderMixin.save_lora_weights(
-                save_directory=tmpdirname,
-                unet_lora_layers=lora_components["unet_lora_layers"],
-                text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
-            )
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-            sd_pipe.load_lora_weights(tmpdirname)
-        lora_images = sd_pipe(**pipeline_inputs).images
-        lora_image_slice = lora_images[0, -3:, -3:, -1]
-
-        # Outputs shouldn't match.
-        self.assertFalse(
-            paddle.allclose(
-                x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
-            ).item()
-        )
-
-    # def test_lora_save_load_safetensors(self):
-    #     pipeline_components, lora_components = self.get_dummy_components()
-    #     sd_pipe = StableDiffusionPipeline(**pipeline_components)
-    #     sd_pipe.set_progress_bar_config(disable=None)
-    #     _, _, pipeline_inputs = self.get_dummy_inputs()
-    #     original_images = sd_pipe(**pipeline_inputs).images
-    #     orig_image_slice = original_images[0, -3:, -3:, -1]
-    #     with tempfile.TemporaryDirectory() as tmpdirname:
-    #         LoraLoaderMixin.save_lora_weights(
-    #             save_directory=tmpdirname,
-    #             unet_lora_layers=lora_components["unet_lora_layers"],
-    #             text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
-    #             safe_serialization=True,
-    #         )
-    #         self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
-    #         sd_pipe.load_lora_weights(tmpdirname)
-    #     lora_images = sd_pipe(**pipeline_inputs).images
-    #     lora_image_slice = lora_images[0, -3:, -3:, -1]
-    #     # Outputs shouldn't match.
-    #     self.assertFalse(
-    #         paddle.allclose(
-    #             x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
-    #         ).item()
-    #     )
-
-    def test_lora_save_load_legacy(self):
-        pipeline_components, lora_components = self.get_dummy_components()
-        unet_lora_attn_procs = lora_components["unet_lora_attn_procs"]
-        sd_pipe = StableDiffusionPipeline(**pipeline_components)
-        sd_pipe.set_progress_bar_config(disable=None)
-        _, _, pipeline_inputs = self.get_dummy_inputs()
-        original_images = sd_pipe(**pipeline_inputs).images
-        orig_image_slice = original_images[0, -3:, -3:, -1]
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            unet = sd_pipe.unet
-            unet.set_attn_processor(unet_lora_attn_procs)
-            unet.save_attn_procs(tmpdirname)
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-            sd_pipe.load_lora_weights(tmpdirname)
-        lora_images = sd_pipe(**pipeline_inputs).images
-        lora_image_slice = lora_images[0, -3:, -3:, -1]
-
-        # Outputs shouldn't match.
-        self.assertFalse(
-            paddle.allclose(
-                x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
-            ).item()
-        )
-
-    def test_text_encoder_lora_monkey_patch(self):
-        pipeline_components, _ = self.get_dummy_components()
-        pipe = StableDiffusionPipeline(**pipeline_components)
-        dummy_tokens = self.get_dummy_tokens()
-        # inference without lora
-        outputs_without_lora = pipe.text_encoder(**dummy_tokens)[0]
-        assert outputs_without_lora.shape == [1, 77, 32]
-        # monkey patch
-        params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
-        set_lora_weights(params, randn_weight=False)
-        # inference with lora
-        outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
-        assert outputs_with_lora.shape == [1, 77, 32]
-        assert paddle.allclose(
-            x=outputs_without_lora, y=outputs_with_lora
-        ).item(), "lora_up_weight are all zero, so the lora outputs should be the same to without lora outputs"
-        # create lora_attn_procs with randn up.weights
-        create_text_encoder_lora_attn_procs(pipe.text_encoder)
-        # monkey patch
-        params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
-        set_lora_weights(params, randn_weight=True)
-        # inference with lora
-        outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
-        assert outputs_with_lora.shape == [1, 77, 32]
-        assert not paddle.allclose(
-            x=outputs_without_lora, y=outputs_with_lora
-        ).item(), "lora_up_weight are not zero, so the lora outputs should be different to without lora outputs"
-
-    def test_text_encoder_lora_remove_monkey_patch(self):
-        pipeline_components, _ = self.get_dummy_components()
-        pipe = StableDiffusionPipeline(**pipeline_components)
-        dummy_tokens = self.get_dummy_tokens()
-        # inference without lora
-        outputs_without_lora = pipe.text_encoder(**dummy_tokens)[0]
-        assert outputs_without_lora.shape == [1, 77, 32]
-        # monkey patch
-        params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
-        set_lora_weights(params, randn_weight=True)
-        # inference with lora
-        outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
-        assert outputs_with_lora.shape == [1, 77, 32]
-        assert not paddle.allclose(
-            x=outputs_without_lora, y=outputs_with_lora
-        ).item(), "lora outputs should be different to without lora outputs"
-        # remove monkey patch
-        pipe._remove_text_encoder_monkey_patch()
-        # inference with removed lora
-        outputs_without_lora_removed = pipe.text_encoder(**dummy_tokens)[0]
-        assert outputs_without_lora_removed.shape == [1, 77, 32]
-        assert paddle.allclose(
-            x=outputs_without_lora, y=outputs_without_lora_removed
-        ).item(), "remove lora monkey patch should restore the original outputs"
-
-    def test_text_encoder_lora_scale(self):
-        pipeline_components, lora_components = self.get_dummy_components()
-        sd_pipe = StableDiffusionPipeline(**pipeline_components)
-        sd_pipe.set_progress_bar_config(disable=None)
-        _, _, pipeline_inputs = self.get_dummy_inputs()
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            LoraLoaderMixin.save_lora_weights(
-                save_directory=tmpdirname,
-                unet_lora_layers=lora_components["unet_lora_layers"],
-                text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
-            )
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-            sd_pipe.load_lora_weights(tmpdirname)
-        lora_images = sd_pipe(**pipeline_inputs).images
-        lora_image_slice = lora_images[0, -3:, -3:, -1]
-        lora_images_with_scale = sd_pipe(**pipeline_inputs, cross_attention_kwargs={"scale": 0.5}).images
-        lora_image_with_scale_slice = lora_images_with_scale[0, -3:, -3:, -1]
-        # Outputs shouldn't match.
-        self.assertFalse(
-            paddle.allclose(
-                x=paddle.to_tensor(data=lora_image_slice), y=paddle.to_tensor(data=lora_image_with_scale_slice)
-            ).item()
-        )
-
-    def test_lora_unet_attn_processors(self):
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            self.create_lora_weight_file(tmpdirname)
-            pipeline_components, _ = self.get_dummy_components()
-            sd_pipe = StableDiffusionPipeline(**pipeline_components)
-            sd_pipe.set_progress_bar_config(disable=None)
-            # check if vanilla attention processors are used
-            for _, module in sd_pipe.unet.named_sublayers():
-                if isinstance(module, Attention):
-                    self.assertIsInstance(module.processor, (AttnProcessor, AttnProcessor2_5))
-            # load LoRA weight file
-            sd_pipe.load_lora_weights(tmpdirname)
-            # check if lora attention processors are used
-            for _, module in sd_pipe.unet.named_sublayers():
-                if isinstance(module, Attention):
-                    attn_proc_class = (
-                        LoRAAttnProcessor2_5
-                        if hasattr(paddle.nn.functional, "scaled_dot_product_attention_")
-                        else LoRAAttnProcessor
-                    )
-                    self.assertIsInstance(module.processor, attn_proc_class)
-
-    def test_unload_lora_sd(self):
-        pipeline_components, lora_components = self.get_dummy_components()
-        _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
-        sd_pipe = StableDiffusionPipeline(**pipeline_components)
-        original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
-        orig_image_slice = original_images[0, -3:, -3:, -1]
-        # Emulate training.
-        set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
-        set_lora_weights(lora_components["text_encoder_lora_layers"].parameters(), randn_weight=True)
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            LoraLoaderMixin.save_lora_weights(
-                save_directory=tmpdirname,
-                unet_lora_layers=lora_components["unet_lora_layers"],
-                text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
-            )
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-            sd_pipe.load_lora_weights(tmpdirname)
-        lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
-        lora_image_slice = lora_images[0, -3:, -3:, -1]
-        # Unload LoRA parameters.
-        sd_pipe.unload_lora_weights()
-        original_images_two = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
-        orig_image_slice_two = original_images_two[0, -3:, -3:, -1]
-        assert not np.allclose(
-            orig_image_slice, lora_image_slice
-        ), "LoRA parameters should lead to a different image slice."
-        assert not np.allclose(
-            orig_image_slice_two, lora_image_slice
-        ), "LoRA parameters should lead to a different image slice."
-        assert np.allclose(
-            orig_image_slice, orig_image_slice_two, atol=0.001
-        ), "Unloading LoRA parameters should lead to results similar to what was obtained with the pipeline without any LoRA parameters."
-
-    def test_lora_unet_attn_processors_with_xformers(self):
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            self.create_lora_weight_file(tmpdirname)
-            pipeline_components, _ = self.get_dummy_components()
-            sd_pipe = StableDiffusionPipeline(**pipeline_components)
-            sd_pipe.set_progress_bar_config(disable=None)
-            # enable XFormers
-            sd_pipe.enable_xformers_memory_efficient_attention()
-            # check if xFormers attention processors are used
-            for _, module in sd_pipe.unet.named_sublayers():
-                if isinstance(module, Attention):
-                    self.assertIsInstance(module.processor, XFormersAttnProcessor)
-            # load LoRA weight file
-            sd_pipe.load_lora_weights(tmpdirname)
-            # check if lora attention processors are used
-            for _, module in sd_pipe.unet.named_sublayers():
-                if isinstance(module, Attention):
-                    self.assertIsInstance(module.processor, LoRAXFormersAttnProcessor)
-            # unload lora weights
-            sd_pipe.unload_lora_weights()
-            # check if attention processors are reverted back to xFormers
-            for _, module in sd_pipe.unet.named_sublayers():
-                if isinstance(module, Attention):
-                    self.assertIsInstance(module.processor, XFormersAttnProcessor)
-
-    def test_lora_save_load_with_xformers(self):
-        pipeline_components, lora_components = self.get_dummy_components()
-        sd_pipe = StableDiffusionPipeline(**pipeline_components)
-        sd_pipe.set_progress_bar_config(disable=None)
-        _, _, pipeline_inputs = self.get_dummy_inputs()
-        # enable XFormers
-        sd_pipe.enable_xformers_memory_efficient_attention()
-        original_images = sd_pipe(**pipeline_inputs).images
-        orig_image_slice = original_images[0, -3:, -3:, -1]
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            LoraLoaderMixin.save_lora_weights(
-                save_directory=tmpdirname,
-                unet_lora_layers=lora_components["unet_lora_layers"],
-                text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
-            )
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-            sd_pipe.load_lora_weights(tmpdirname)
-        lora_images = sd_pipe(**pipeline_inputs).images
-        lora_image_slice = lora_images[0, -3:, -3:, -1]
-        # Outputs shouldn't match.
-        self.assertFalse(
-            paddle.allclose(
-                x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
-            ).item()
-        )
-
-
-class SDXLLoraLoaderMixinTests(unittest.TestCase):
-    def get_dummy_components(self):
-        paddle.Generator().manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            # SD2-specific config below
-            attention_head_dim=(2, 4),
-            use_linear_projection=True,
-            addition_embed_type="text_time",
-            addition_time_embed_dim=8,
-            transformer_layers_per_block=(1, 2),
-            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
-            cross_attention_dim=64,
-        )
-        scheduler = EulerDiscreteScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            steps_offset=1,
-            beta_schedule="scaled_linear",
-            timestep_spacing="leading",
-        )
-        paddle.Generator().manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-            sample_size=128,
-        )
-        paddle.Generator().manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-            # SD2-specific config below
-            hidden_act="gelu",
-            projection_dim=32,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
-        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet)
-        text_encoder_one_lora_layers = create_text_encoder_lora_layers(text_encoder)
-        text_encoder_two_lora_layers = create_text_encoder_lora_layers(text_encoder_2)
-        pipeline_components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "text_encoder_2": text_encoder_2,
-            "tokenizer": tokenizer,
-            "tokenizer_2": tokenizer_2,
-        }
-        lora_components = {
-            "unet_lora_layers": unet_lora_layers,
-            "text_encoder_one_lora_layers": text_encoder_one_lora_layers,
-            "text_encoder_two_lora_layers": text_encoder_two_lora_layers,
-            "unet_lora_attn_procs": unet_lora_attn_procs,
-        }
-        return pipeline_components, lora_components
-
-    def get_dummy_inputs(self, with_generator=True):
-        batch_size = 1
-        sequence_length = 10
-        num_channels = 4
-        sizes = 32, 32
-        generator = paddle.Generator().manual_seed(0)
-        noise = floats_tensor((batch_size, num_channels) + sizes)
-        input_ids = paddle.randint(low=1, high=sequence_length, shape=(batch_size, sequence_length))
-        pipeline_inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-            "output_type": "np",
-        }
-        if with_generator:
-            pipeline_inputs.update({"generator": generator})
-        return noise, input_ids, pipeline_inputs
-
-    def test_lora_save_load(self):
-        pipeline_components, lora_components = self.get_dummy_components()
-        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
-        sd_pipe.set_progress_bar_config(disable=None)
-        _, _, pipeline_inputs = self.get_dummy_inputs()
-        original_images = sd_pipe(**pipeline_inputs).images
-        orig_image_slice = original_images[0, -3:, -3:, -1]
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            StableDiffusionXLPipeline.save_lora_weights(
-                save_directory=tmpdirname,
-                unet_lora_layers=lora_components["unet_lora_layers"],
-                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
-                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
-            )
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-            sd_pipe.load_lora_weights(tmpdirname)
-        lora_images = sd_pipe(**pipeline_inputs).images
-        lora_image_slice = lora_images[0, -3:, -3:, -1]
-        # Outputs shouldn't match.
-        self.assertFalse(
-            paddle.allclose(
-                x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
-            ).item()
-        )
-
-    def test_unload_lora_sdxl(self):
-        pipeline_components, lora_components = self.get_dummy_components()
-        _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
-        sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
-        original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
-        orig_image_slice = original_images[0, -3:, -3:, -1]
-        # Emulate training.
-        set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
-        set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
-        set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            StableDiffusionXLPipeline.save_lora_weights(
-                save_directory=tmpdirname,
-                unet_lora_layers=lora_components["unet_lora_layers"],
-                text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
-                text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
-            )
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-            sd_pipe.load_lora_weights(tmpdirname)
-        lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
-        lora_image_slice = lora_images[0, -3:, -3:, -1]
-        # Unload LoRA parameters.
-        sd_pipe.unload_lora_weights()
-        original_images_two = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
-        orig_image_slice_two = original_images_two[0, -3:, -3:, -1]
-        assert not np.allclose(
-            orig_image_slice, lora_image_slice
-        ), "LoRA parameters should lead to a different image slice."
-        assert not np.allclose(
-            orig_image_slice_two, lora_image_slice
-        ), "LoRA parameters should lead to a different image slice."
-        assert np.allclose(
-            orig_image_slice, orig_image_slice_two, atol=0.001
-        ), "Unloading LoRA parameters should lead to results similar to what was obtained with the pipeline without any LoRA parameters."
-
-
-@slow
-@require_paddle_gpu
-class LoraIntegrationTests(unittest.TestCase):
-    def test_dreambooth_old_format(self):
-        generator = paddle.Generator().manual_seed().manual_seed(0)
-        lora_model_id = "hf-internal-testing/lora_dreambooth_dog_example"
-        card = RepoCard.load(lora_model_id)
-        base_model_id = card.data.to_dict()["base_model"]
-        pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
-        pipe.load_lora_weights(lora_model_id)
-        images = pipe(
-            "A photo of a sks dog floating in the river", output_type="np", generator=generator, num_inference_steps=2
-        ).images
-        images = images[0, -3:, -3:, -1].flatten()
-        expected = np.array([0.7207, 0.6787, 0.601, 0.7478, 0.6838, 0.6064, 0.6984, 0.6443, 0.5785])
-        self.assertTrue(np.allclose(images, expected, atol=0.0001))
-
-    def test_dreambooth_text_encoder_new_format(self):
-        generator = paddle.Generator().manual_seed().manual_seed(0)
-        lora_model_id = "hf-internal-testing/lora-trained"
-        card = RepoCard.load(lora_model_id)
-        base_model_id = card.data.to_dict()["base_model"]
-        pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
-        pipe.load_lora_weights(lora_model_id)
-        images = pipe("A photo of a sks dog", output_type="np", generator=generator, num_inference_steps=2).images
-        images = images[0, -3:, -3:, -1].flatten()
-        expected = np.array([0.6628, 0.6138, 0.539, 0.6625, 0.613, 0.5463, 0.6166, 0.5788, 0.5359])
-        self.assertTrue(np.allclose(images, expected, atol=0.0001))
-
-    def test_a1111(self):
-        generator = paddle.Generator().manual_seed().manual_seed(0)
-        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
-        lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
-        lora_filename = "light_and_shadow.safetensors"
-        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
-        images = pipe(
-            "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
-        ).images
-        images = images[0, -3:, -3:, -1].flatten()
-        expected = np.array([0.3725, 0.3767, 0.3761, 0.3796, 0.3827, 0.3763, 0.3831, 0.3809, 0.3392])
-        self.assertTrue(np.allclose(images, expected, atol=0.0001))
-
-    def test_vanilla_funetuning(self):
-        generator = paddle.Generator().manual_seed().manual_seed(0)
-        lora_model_id = "hf-internal-testing/sd-model-finetuned-lora-t4"
-        card = RepoCard.load(lora_model_id)
-        base_model_id = card.data.to_dict()["base_model"]
-        pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
-        pipe.load_lora_weights(lora_model_id)
-        images = pipe("A pokemon with blue eyes.", output_type="np", generator=generator, num_inference_steps=2).images
-        images = images[0, -3:, -3:, -1].flatten()
-        expected = np.array([0.7406, 0.699, 0.5963, 0.7493, 0.7045, 0.6096, 0.6886, 0.6388, 0.583])
-        self.assertTrue(np.allclose(images, expected, atol=0.0001))
-
-    def test_unload_lora(self):
-        generator = paddle.Generator().manual_seed(0)
-        prompt = "masterpiece, best quality, mountain"
-        num_inference_steps = 2
-        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
-        initial_images = pipe(
-            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
-        ).images
-        initial_images = initial_images[0, -3:, -3:, -1].flatten()
-        lora_model_id = "hf-internal-testing/civitai-colored-icons-lora"
-        lora_filename = "Colored_Icons_by_vizsumit.safetensors"
-        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
-        generator = paddle.Generator().manual_seed(0)
-        lora_images = pipe(
-            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
-        ).images
-        lora_images = lora_images[0, -3:, -3:, -1].flatten()
-        pipe.unload_lora_weights()
-        generator = paddle.Generator().manual_seed(0)
-        unloaded_lora_images = pipe(
-            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
-        ).images
-        unloaded_lora_images = unloaded_lora_images[0, -3:, -3:, -1].flatten()
-        self.assertFalse(np.allclose(initial_images, lora_images))
-        self.assertTrue(np.allclose(initial_images, unloaded_lora_images, atol=0.001))
-
-    def test_load_unload_load_kohya_lora(self):
-        # This test ensures that a Kohya-style LoRA can be safely unloaded and then loaded
-        # without introducing any side-effects. Even though the test uses a Kohya-style
-        # LoRA, the underlying adapter handling mechanism is format-agnostic.
-        generator = paddle.Generator().manual_seed(0)
-        prompt = "masterpiece, best quality, mountain"
-        num_inference_steps = 2
-        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
-        initial_images = pipe(
-            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
-        ).images
-        initial_images = initial_images[0, -3:, -3:, -1].flatten()
-        lora_model_id = "hf-internal-testing/civitai-colored-icons-lora"
-        lora_filename = "Colored_Icons_by_vizsumit.safetensors"
-        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
-        generator = paddle.Generator().manual_seed(0)
-        lora_images = pipe(
-            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
-        ).images
-        lora_images = lora_images[0, -3:, -3:, -1].flatten()
-        pipe.unload_lora_weights()
-        generator = paddle.Generator().manual_seed(0)
-        unloaded_lora_images = pipe(
-            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
-        ).images
-        unloaded_lora_images = unloaded_lora_images[0, -3:, -3:, -1].flatten()
-        self.assertFalse(np.allclose(initial_images, lora_images))
-        self.assertTrue(np.allclose(initial_images, unloaded_lora_images, atol=0.001))
-        # make sure we can load a LoRA again after unloading and they don't have
-        # any undesired effects.
-        pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
-        generator = paddle.Generator().manual_seed(0)
-        lora_images_again = pipe(
-            prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
-        ).images
-        lora_images_again = lora_images_again[0, -3:, -3:, -1].flatten()
-        self.assertTrue(np.allclose(lora_images, lora_images_again, atol=0.001))
diff --git a/ppdiffusers/tests/models/test_modeling_common.py b/ppdiffusers/tests/models/test_modeling_common.py
index 557c19f4a..5efe6861a 100644
--- a/ppdiffusers/tests/models/test_modeling_common.py
+++ b/ppdiffusers/tests/models/test_modeling_common.py
@@ -155,7 +155,7 @@ class ModelTesterMixin:
     main_input_name = None  # overwrite in model specific tester class
     base_precision = 1e-3
 
-    def test_from_save_pretrained(self):
+    def test_from_save_pretrained(self, expected_max_diff=1e-01):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**init_dict)
         if hasattr(model, "set_default_attn_processor"):
@@ -174,7 +174,7 @@ def test_from_save_pretrained(self):
             if isinstance(new_image, dict):
                 new_image = new_image.to_tuple()[0]
         max_diff = (image - new_image).abs().sum().item()
-        self.assertLessEqual(max_diff, 1e-01, "Models give different forward passes")
+        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
 
     def test_getattr_is_correct(self):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -259,7 +259,7 @@ def test_set_attn_processor_for_determinism(self):
         assert paddle.allclose(x=output_2, y=output_5, atol=self.base_precision).item()
         assert paddle.allclose(x=output_2, y=output_6, atol=self.base_precision).item()
 
-    def test_from_save_pretrained_variant(self):
+    def test_from_save_pretrained_variant(self, expected_max_diff=1e-01):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**init_dict)
         if hasattr(model, "set_default_attn_processor"):
@@ -289,7 +289,7 @@ def test_from_save_pretrained_variant(self):
             if isinstance(new_image, dict):
                 new_image = new_image.to_tuple()[0]
         max_diff = (image - new_image).abs().sum().item()
-        self.assertLessEqual(max_diff, 1e-01, "Models give different forward passes")
+        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
 
     def test_from_save_pretrained_dtype(self):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -393,6 +393,7 @@ def test_ema_training(self):
     def test_outputs_equivalence(self):
         def set_nan_tensor_to_zero(t):
             # t[t != t] = 0
+            t = paddle.nan_to_num(t, 0, 0, 0)
             return t
 
         def recursive_check(tuple_object, dict_object):
@@ -407,7 +408,9 @@ def recursive_check(tuple_object, dict_object):
             else:
                 self.assertTrue(
                     paddle.allclose(
-                        set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-05
+                        set_nan_tensor_to_zero(tuple_object),
+                        set_nan_tensor_to_zero(dict_object),
+                        atol=5e-05,  # 1e-05, The original test needs to increase the error.
                     ),
                     msg=f"Tuple and dict output are not equal. Difference: {paddle.max(x=paddle.abs(x=tuple_object - dict_object))}. Tuple has `nan`: {paddle.isnan(x=tuple_object).any()} and `inf`: {paddle.isinf(x=tuple_object)}. Dict has `nan`: {paddle.isnan(x=dict_object).any()} and `inf`: {paddle.isinf(x=dict_object)}.",
                 )
diff --git a/ppdiffusers/tests/models/test_models_unet_1d.py b/ppdiffusers/tests/models/test_models_unet_1d.py
index a0b39333f..a26179fb0 100644
--- a/ppdiffusers/tests/models/test_models_unet_1d.py
+++ b/ppdiffusers/tests/models/test_models_unet_1d.py
@@ -59,6 +59,9 @@ def test_outputs_equivalence(self):
     def test_from_save_pretrained(self):
         super().test_from_save_pretrained()
 
+    def test_from_save_pretrained_variant(self):
+        super().test_from_save_pretrained_variant()
+
     def test_model_from_pretrained(self):
         super().test_model_from_pretrained()
 
diff --git a/ppdiffusers/tests/models/test_models_unet_2d.py b/ppdiffusers/tests/models/test_models_unet_2d.py
index 07885618b..fd5e08043 100644
--- a/ppdiffusers/tests/models/test_models_unet_2d.py
+++ b/ppdiffusers/tests/models/test_models_unet_2d.py
@@ -65,6 +65,35 @@ def prepare_init_args_and_inputs_for_common(self):
         inputs_dict = self.dummy_input
         return init_dict, inputs_dict
 
+    def test_mid_block_attn_groups(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["norm_num_groups"] = 16
+        init_dict["add_attention"] = True
+        init_dict["attn_norm_num_groups"] = 8
+
+        model = self.model_class(**init_dict)
+        model.eval()
+
+        self.assertIsNotNone(
+            model.mid_block.attentions[0].group_norm, "Mid block Attention group norm should exist but does not."
+        )
+        self.assertEqual(
+            model.mid_block.attentions[0].group_norm._num_groups,
+            init_dict["attn_norm_num_groups"],
+            "Mid block Attention group norm does not have the expected number of groups.",
+        )
+
+        with paddle.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.to_tuple()[0]
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
 
 class UNetLDMModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
     model_class = UNet2DModel
diff --git a/ppdiffusers/tests/models/test_models_unet_2d_condition.py b/ppdiffusers/tests/models/test_models_unet_2d_condition.py
index cc8fb346f..07c541d5c 100644
--- a/ppdiffusers/tests/models/test_models_unet_2d_condition.py
+++ b/ppdiffusers/tests/models/test_models_unet_2d_condition.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import gc
 import os
 import tempfile
@@ -24,10 +25,7 @@
 from pytest import mark
 
 from ppdiffusers import UNet2DConditionModel
-from ppdiffusers.models.attention_processor import (
-    CustomDiffusionAttnProcessor,
-    LoRAAttnProcessor,
-)
+from ppdiffusers.models.attention_processor import CustomDiffusionAttnProcessor
 from ppdiffusers.utils import (
     floats_tensor,
     load_ppnlp_numpy,
@@ -46,28 +44,6 @@
 enable_full_determinism()
 
 
-def create_lora_layers(model, mock_weights: bool = True):
-    lora_attn_procs = {}
-    for name in model.attn_processors.keys():
-        cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
-        if name.startswith("mid_block"):
-            hidden_size = model.config.block_out_channels[-1]
-        elif name.startswith("up_blocks"):
-            block_id = int(name[len("up_blocks.")])
-            hidden_size = list(reversed(model.config.block_out_channels))[block_id]
-        elif name.startswith("down_blocks"):
-            block_id = int(name[len("down_blocks.")])
-            hidden_size = model.config.block_out_channels[block_id]
-        lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
-        if mock_weights:
-            with paddle.no_grad():
-                lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1)
-                lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1)
-                lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1)
-                lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1)
-    return lora_attn_procs
-
-
 def create_custom_ppdiffusion_layers(model, mock_weights: bool = True):
     train_kv = True
     train_q_out = True
@@ -184,11 +160,12 @@ def test_gradient_checkpointing(self):
         model_2.clear_gradients()
         loss_2 = (out_2 - labels).mean()
         loss_2.backward()
-        self.assertTrue((loss - loss_2).abs() < 1e-05)
+        # UNetMidBlock2DCrossAttn create_custom_forward associates the difference.
+        self.assertTrue((loss - loss_2).abs() < 1e-5)
         named_params = dict(model.named_parameters())
         named_params_2 = dict(model_2.named_parameters())
         for name, param in named_params.items():
-            self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-05))
+            self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-5))
 
     def test_model_with_attention_head_dim_tuple(self):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -313,6 +290,42 @@ def check_sliceable_dim_attr(module: paddle.nn.Layer):
         for module in model.children():
             check_sliceable_dim_attr(module)
 
+    def test_gradient_checkpointing_is_applied(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model_class_copy = copy.copy(self.model_class)
+
+        modules_with_gc_enabled = {}
+
+        # now monkey patch the following function:
+        #     def _set_gradient_checkpointing(self, module, value=False):
+        #         if hasattr(module, "gradient_checkpointing"):
+        #             module.gradient_checkpointing = value
+
+        def _set_gradient_checkpointing_new(self, module, value=False):
+            if hasattr(module, "gradient_checkpointing"):
+                module.gradient_checkpointing = value
+                modules_with_gc_enabled[module.__class__.__name__] = True
+
+        model_class_copy._set_gradient_checkpointing = _set_gradient_checkpointing_new
+
+        model = model_class_copy(**init_dict)
+        model.enable_gradient_checkpointing()
+
+        EXPECTED_SET = {
+            "CrossAttnUpBlock2D",
+            "CrossAttnDownBlock2D",
+            "UNetMidBlock2DCrossAttn",
+            "UpBlock2D",
+            "Transformer2DModel",
+            "DownBlock2D",
+        }
+
+        assert set(modules_with_gc_enabled.keys()) == EXPECTED_SET
+        assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
+
     def test_special_attn_proc(self):
         class AttnEasyProc(nn.Layer):
             def __init__(self, num):
@@ -405,139 +418,6 @@ def test_model_xattn_padding(self):
                 y=keeplast_out, rtol=1e-3, atol=1e-5
             ).item(), "a mask with fewer tokens than condition, will be padded with 'keep' tokens. a 'discard-all' mask missing the final token is thus equivalent to a 'keep last' mask."
 
-    def test_lora_processors(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        init_dict["attention_head_dim"] = 8, 16
-        model = self.model_class(**init_dict)
-        with paddle.no_grad():
-            sample1 = model(**inputs_dict).sample
-        lora_attn_procs = create_lora_layers(model)
-        model.set_attn_processor(lora_attn_procs)
-        model.set_attn_processor(model.attn_processors)
-        with paddle.no_grad():
-            sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
-            sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-            sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-        assert (sample1 - sample2).abs().max() < 3e-3
-        assert (sample3 - sample4).abs().max() < 3e-3
-        assert (sample2 - sample3).abs().max() > 3e-3
-
-    def test_lora_save_load(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        init_dict["attention_head_dim"] = 8, 16
-        paddle.seed(0)
-        model = self.model_class(**init_dict)
-        with paddle.no_grad():
-            old_sample = model(**inputs_dict).sample
-        lora_attn_procs = create_lora_layers(model)
-        model.set_attn_processor(lora_attn_procs)
-        with paddle.no_grad():
-            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_attn_procs(tmpdirname, to_diffusers=False)
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-            paddle.seed(0)
-            new_model = self.model_class(**init_dict)
-            new_model.load_attn_procs(tmpdirname, from_diffusers=False)
-
-        with paddle.no_grad():
-            new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
-        assert (sample - new_sample).abs().max() < 1e-4
-
-        # LoRA and no LoRA should NOT be the same
-        assert (sample - old_sample).abs().max() > 1e-4
-
-    def test_lora_save_load_safetensors(self):
-        # enable deterministic behavior for gradient checkpointing
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        init_dict["attention_head_dim"] = (8, 16)
-
-        paddle.seed(0)
-        model = self.model_class(**init_dict)
-
-        with paddle.no_grad():
-            old_sample = model(**inputs_dict).sample
-
-        lora_attn_procs = create_lora_layers(model)
-        model.set_attn_processor(lora_attn_procs)
-
-        with paddle.no_grad():
-            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True)
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
-            paddle.seed(0)
-            new_model = self.model_class(**init_dict)
-            new_model.load_attn_procs(tmpdirname, from_diffusers=True, use_safetensors=True)
-        with paddle.no_grad():
-            new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-        assert (sample - new_sample).abs().max() < 0.0001
-        assert (sample - old_sample).abs().max() > 0.0001
-
-    # def test_lora_save_safetensors_load_torch(self):
-    #     # enable deterministic behavior for gradient checkpointing
-    #     init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-    #     init_dict["attention_head_dim"] = (8, 16)
-
-    #     paddle.seed(0)
-    #     model = self.model_class(**init_dict)
-
-    #     lora_attn_procs = create_lora_layers(model, mock_weights=False)
-    #     model.set_attn_processor(lora_attn_procs)
-    #     # Saving as paddle, properly reloads with directly filename
-    #     with tempfile.TemporaryDirectory() as tmpdirname:
-    #         model.save_attn_procs(tmpdirname, to_diffusers=True)
-    #         self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
-    #         paddle.seed(0)
-    #         new_model = self.model_class(**init_dict)
-    #         new_model.load_attn_procs(
-    #             tmpdirname, weight_name="pytorch_lora_weights.bin", from_diffusers=True, use_safetensors=False
-    #         )
-
-    def test_lora_save_torch_force_load_safetensors_error(self):
-        pass
-
-    def test_lora_on_off(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        init_dict["attention_head_dim"] = 8, 16
-        paddle.seed(0)
-        model = self.model_class(**init_dict)
-        with paddle.no_grad():
-            old_sample = model(**inputs_dict).sample
-        lora_attn_procs = create_lora_layers(model)
-        model.set_attn_processor(lora_attn_procs)
-        with paddle.no_grad():
-            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
-        model.set_default_attn_processor()
-        with paddle.no_grad():
-            new_sample = model(**inputs_dict).sample
-        assert (sample - new_sample).abs().max() < 0.0001
-        assert (sample - old_sample).abs().max() < 3e-3
-
-    @unittest.skipIf(
-        not is_ppxformers_available(),
-        reason="scaled_dot_product_attention attention is only available with CUDA and `scaled_dot_product_attention` installed",
-    )
-    def test_lora_xformers_on_off(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        init_dict["attention_head_dim"] = 8, 16
-        paddle.seed(0)
-        model = self.model_class(**init_dict)
-        lora_attn_procs = create_lora_layers(model)
-        model.set_attn_processor(lora_attn_procs)
-        with paddle.no_grad():
-            sample = model(**inputs_dict).sample
-            model.enable_xformers_memory_efficient_attention()
-            on_sample = model(**inputs_dict).sample
-            model.disable_xformers_memory_efficient_attention()
-            off_sample = model(**inputs_dict).sample
-        assert (sample - on_sample).abs().max() < 0.05
-        assert (sample - off_sample).abs().max() < 0.05
-
     def test_custom_diffusion_processors(self):
         # enable deterministic behavior for gradient checkpointing
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
diff --git a/ppdiffusers/tests/models/test_models_unet_3d_condition.py b/ppdiffusers/tests/models/test_models_unet_3d_condition.py
index e1fcdee1c..5eebe37e2 100644
--- a/ppdiffusers/tests/models/test_models_unet_3d_condition.py
+++ b/ppdiffusers/tests/models/test_models_unet_3d_condition.py
@@ -12,15 +12,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-import tempfile
 import unittest
 
 import numpy as np
 import paddle
 
 from ppdiffusers.models import UNet3DConditionModel
-from ppdiffusers.models.attention_processor import AttnProcessor, LoRAAttnProcessor
 from ppdiffusers.utils import floats_tensor, logging
 from ppdiffusers.utils.import_utils import is_ppxformers_available
 from ppdiffusers.utils.testing_utils import enable_full_determinism
@@ -32,37 +29,6 @@
 logger = logging.get_logger(__name__)
 
 
-def create_lora_layers(model, mock_weights: bool = True):
-    lora_attn_procs = {}
-    for name in model.attn_processors.keys():
-        has_cross_attention = name.endswith("attn2.processor") and not (
-            name.startswith("transformer_in") or "temp_attentions" in name.split(".")
-        )
-        cross_attention_dim = model.config.cross_attention_dim if has_cross_attention else None
-        if name.startswith("mid_block"):
-            hidden_size = model.config.block_out_channels[-1]
-        elif name.startswith("up_blocks"):
-            block_id = int(name[len("up_blocks.")])
-            hidden_size = list(reversed(model.config.block_out_channels))[block_id]
-        elif name.startswith("down_blocks"):
-            block_id = int(name[len("down_blocks.")])
-            hidden_size = model.config.block_out_channels[block_id]
-        elif name.startswith("transformer_in"):
-            # Note that the `8 * ...` comes from: https://github.com/huggingface/diffusers/blob/7139f0e874f10b2463caa8cbd585762a309d12d6/src/diffusers/models/unet_3d_condition.py#L148
-            hidden_size = 8 * model.config.attention_head_dim
-
-        lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
-
-        if mock_weights:
-            # add 1 to weights to mock trained weights
-            with paddle.no_grad():
-                lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1)
-                lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1)
-                lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1)
-                lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1)
-    return lora_attn_procs
-
-
 class UNet3DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
     model_class = UNet3DConditionModel
     main_input_name = "sample"
@@ -172,179 +138,6 @@ def test_model_attention_slicing(self):
             output = model(**inputs_dict)
         assert output is not None
 
-    def test_lora_processors(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        init_dict["attention_head_dim"] = 8
-
-        model = self.model_class(**init_dict)
-
-        with paddle.no_grad():
-            sample1 = model(**inputs_dict).sample
-
-        lora_attn_procs = create_lora_layers(model)
-
-        # make sure we can set a list of attention processors
-        model.set_attn_processor(lora_attn_procs)
-
-        # test that attn processors can be set to itself
-        model.set_attn_processor(model.attn_processors)
-
-        with paddle.no_grad():
-            sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
-            sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-            sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
-        assert (sample1 - sample2).abs().max() < 3e-3
-        assert (sample3 - sample4).abs().max() < 3e-3
-
-        # sample 2 and sample 3 should be different
-        assert (sample2 - sample3).abs().max() > 3e-3
-
-    def test_lora_save_load(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        init_dict["attention_head_dim"] = 8
-
-        paddle.seed(0)
-        model = self.model_class(**init_dict)
-
-        with paddle.no_grad():
-            old_sample = model(**inputs_dict).sample
-
-        lora_attn_procs = create_lora_layers(model)
-        model.set_attn_processor(lora_attn_procs)
-
-        with paddle.no_grad():
-            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_attn_procs(
-                tmpdirname,
-                to_diffusers=False,
-            )
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
-            paddle.seed(0)
-            new_model = self.model_class(**init_dict)
-            new_model.load_attn_procs(tmpdirname, from_diffusers=False)
-
-        with paddle.no_grad():
-            new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
-        assert (sample - new_sample).abs().max() < 1e-3
-
-        # LoRA and no LoRA should NOT be the same
-        assert (sample - old_sample).abs().max() > 1e-4
-
-    def test_lora_save_load_safetensors(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        init_dict["attention_head_dim"] = 8
-
-        paddle.seed(0)
-        model = self.model_class(**init_dict)
-
-        with paddle.no_grad():
-            old_sample = model(**inputs_dict).sample
-
-        lora_attn_procs = create_lora_layers(model)
-        model.set_attn_processor(lora_attn_procs)
-
-        with paddle.no_grad():
-            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True)
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
-            paddle.seed(0)
-            new_model = self.model_class(**init_dict)
-            new_model.load_attn_procs(tmpdirname, use_safetensors=True, from_diffusers=True)
-
-        with paddle.no_grad():
-            new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
-        assert (sample - new_sample).abs().max() < 3e-3
-
-        # LoRA and no LoRA should NOT be the same
-        assert (sample - old_sample).abs().max() > 1e-4
-
-    # def test_lora_save_safetensors_load_torch(self):
-    #     # enable deterministic behavior for gradient checkpointing
-    #     init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-    #     init_dict["attention_head_dim"] = 8
-
-    #     paddle.seed(0)
-    #     model = self.model_class(**init_dict)
-
-    #     lora_attn_procs = create_lora_layers(model, mock_weights=False)
-    #     model.set_attn_processor(lora_attn_procs)
-    #     # Saving as paddle, properly reloads with directly filename
-    #     with tempfile.TemporaryDirectory() as tmpdirname:
-    #         model.save_attn_procs(tmpdirname, to_diffusers=True)
-    #         self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
-    #         paddle.seed(0)
-    #         new_model = self.model_class(**init_dict)
-    #         new_model.load_attn_procs(
-    #             tmpdirname, weight_name="pytorch_lora_weights.bin", use_safetensors=False, from_diffusers=True
-    #         )
-
-    def test_lora_save_paddle_force_load_safetensors_error(self):
-        pass
-
-    def test_lora_on_off(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        init_dict["attention_head_dim"] = 8
-
-        paddle.seed(0)
-        model = self.model_class(**init_dict)
-
-        with paddle.no_grad():
-            old_sample = model(**inputs_dict).sample
-
-        lora_attn_procs = create_lora_layers(model)
-        model.set_attn_processor(lora_attn_procs)
-
-        with paddle.no_grad():
-            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
-
-        model.set_attn_processor(AttnProcessor())
-
-        with paddle.no_grad():
-            new_sample = model(**inputs_dict).sample
-
-        assert (sample - new_sample).abs().max() < 1e-4
-        assert (sample - old_sample).abs().max() < 3e-3
-
-    @unittest.skipIf(
-        not is_ppxformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_lora_xformers_on_off(self):
-        # enable deterministic behavior for gradient checkpointing
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        init_dict["attention_head_dim"] = 4
-
-        paddle.seed(0)
-        model = self.model_class(**init_dict)
-        lora_attn_procs = create_lora_layers(model)
-        model.set_attn_processor(lora_attn_procs)
-
-        # default
-        with paddle.no_grad():
-            sample = model(**inputs_dict).sample
-
-            model.enable_xformers_memory_efficient_attention()
-            on_sample = model(**inputs_dict).sample
-
-            model.disable_xformers_memory_efficient_attention()
-            off_sample = model(**inputs_dict).sample
-
-        assert (sample - on_sample).abs().max() < 0.005
-        assert (sample - off_sample).abs().max() < 0.005
-
     def test_feed_forward_chunking(self):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         init_dict["norm_num_groups"] = 32

From 84ab2b8f44ae035418e18e4fa608cae3bd300876 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Thu, 30 Nov 2023 16:53:31 +0800
Subject: [PATCH 02/19] Fix

---
 ppdiffusers/ppdiffusers/models/transformer_2d.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ppdiffusers/ppdiffusers/models/transformer_2d.py b/ppdiffusers/ppdiffusers/models/transformer_2d.py
index fdaa135c0..b750ac7b2 100644
--- a/ppdiffusers/ppdiffusers/models/transformer_2d.py
+++ b/ppdiffusers/ppdiffusers/models/transformer_2d.py
@@ -330,9 +330,10 @@ def forward(
             if self.use_linear_projection:
                 hidden_states = self.proj_in(hidden_states, scale=lora_scale)
         elif self.is_input_vectorized:
-            # paddle original code:
-            # hidden_states = self.latent_image_embedding(hidden_states.cast("int64"))
-            hidden_states = self.latent_image_embedding(hidden_states)
+            # pytorch code:
+            # hidden_states = self.latent_image_embedding(hidden_states)
+            # paddle code, _C_ops.embedding not support float32, need convert to int64:
+            hidden_states = self.latent_image_embedding(hidden_states.cast("int64"))
         elif self.is_input_patches:
             height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
             hidden_states = self.pos_embed(hidden_states)

From 73bc623746eeacf0d15a581c414e264d10cf0b2d Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Thu, 30 Nov 2023 17:10:49 +0800
Subject: [PATCH 03/19] Fix

---
 .../pipelines/stable_diffusion/test_stable_diffusion.py      | 5 +++--
 .../stable_diffusion/test_stable_diffusion_inpaint.py        | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 65bfc6d6a..8bae56e28 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -14,7 +14,8 @@
 # limitations under the License.
 
 import gc
-import tempfile
+
+# import tempfile
 import unittest
 
 import numpy as np
@@ -36,7 +37,7 @@
 from ppdiffusers.utils import nightly, slow
 from ppdiffusers.utils.testing_utils import CaptureLogger, require_paddle_gpu
 
-from ...models.test_models_unet_2d_condition import create_lora_layers
+from ...lora.test_lora_layers import create_lora_layers
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin
 
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 0b8fca03d..98a974cfa 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -40,7 +40,7 @@
     require_paddle_gpu,
 )
 
-from ...models.test_models_unet_2d_condition import create_lora_layers
+from ...lora.test_lora_layers import create_lora_layers
 from ..pipeline_params import (
     TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
     TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,

From 88b283f85daebdf24267fdab4cbfd8c5c866b374 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Sat, 2 Dec 2023 14:57:06 +0800
Subject: [PATCH 04/19] ci


From 2cb555f59d8655737ebe42def5eece0c74e12192 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Sat, 2 Dec 2023 15:56:43 +0800
Subject: [PATCH 05/19] Fix

---
 ppdiffusers/ppdiffusers/models/adapter.py     | 345 ++++++++++++++++--
 .../pipeline_stable_diffusion_adapter.py      | 200 ++++++++--
 2 files changed, 488 insertions(+), 57 deletions(-)

diff --git a/ppdiffusers/ppdiffusers/models/adapter.py b/ppdiffusers/ppdiffusers/models/adapter.py
index 783fde367..cd41c7218 100644
--- a/ppdiffusers/ppdiffusers/models/adapter.py
+++ b/ppdiffusers/ppdiffusers/models/adapter.py
@@ -12,15 +12,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from typing import List, Optional
+import os
+from typing import Callable, List, Optional, Union
 
 import paddle
 import paddle.nn as nn
 
 from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import logging
 from .modeling_utils import ModelMixin
-from .resnet import Downsample2D
+
+logger = logging.get_logger(__name__)
 
 
 class MultiAdapter(ModelMixin):
@@ -38,9 +40,39 @@ class MultiAdapter(ModelMixin):
 
     def __init__(self, adapters: List["T2IAdapter"]):
         super(MultiAdapter, self).__init__()
+
         self.num_adapter = len(adapters)
         self.adapters = nn.LayerList(adapters)
 
+        if len(adapters) == 0:
+            raise ValueError("Expecting at least one adapter")
+
+        if len(adapters) == 1:
+            raise ValueError("For a single adapter, please use the `T2IAdapter` class instead of `MultiAdapter`")
+
+        # The outputs from each adapter are added together with a weight.
+        # This means that the change in dimensions from downsampling must
+        # be the same for all adapters. Inductively, it also means the
+        # downscale_factor and total_downscale_factor must be the same for all
+        # adapters.
+        first_adapter_total_downscale_factor = adapters[0].total_downscale_factor
+        first_adapter_downscale_factor = adapters[0].downscale_factor
+        for idx in range(1, len(adapters)):
+            if (
+                adapters[idx].total_downscale_factor != first_adapter_total_downscale_factor
+                or adapters[idx].downscale_factor != first_adapter_downscale_factor
+            ):
+                raise ValueError(
+                    f"Expecting all adapters to have the same downscaling behavior, but got:\n"
+                    f"adapters[0].total_downscale_factor={first_adapter_total_downscale_factor}\n"
+                    f"adapters[0].downscale_factor={first_adapter_downscale_factor}\n"
+                    f"adapter[`{idx}`].total_downscale_factor={adapters[idx].total_downscale_factor}\n"
+                    f"adapter[`{idx}`].downscale_factor={adapters[idx].downscale_factor}"
+                )
+
+        self.total_downscale_factor = first_adapter_total_downscale_factor
+        self.downscale_factor = first_adapter_downscale_factor
+
     def forward(self, xs: paddle.Tensor, adapter_weights: Optional[List[float]] = None) -> List[paddle.Tensor]:
         """
         Args:
@@ -55,21 +87,132 @@ def forward(self, xs: paddle.Tensor, adapter_weights: Optional[List[float]] = No
             adapter_weights = paddle.to_tensor([1 / self.num_adapter] * self.num_adapter)
         else:
             adapter_weights = paddle.to_tensor(adapter_weights)
-        if xs.shape[1] % self.num_adapter != 0:
-            raise ValueError(
-                f"Expecting multi-adapter's input have number of channel that cab be evenly divisible by num_adapter: {xs.shape[1]} % {self.num_adapter} != 0"
-            )
-        x_list = paddle.chunk(xs, chunks=self.num_adapter, axis=1)
+
         accume_state = None
-        for x, w, adapter in zip(x_list, adapter_weights, self.adapters):
+        for x, w, adapter in zip(xs, adapter_weights, self.adapters):
             features = adapter(x)
             if accume_state is None:
                 accume_state = features
+                for i in range(len(accume_state)):
+                    accume_state[i] = w * accume_state[i]
             else:
                 for i in range(len(features)):
                     accume_state[i] += w * features[i]
         return accume_state
 
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        is_main_process: bool = True,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+        variant: Optional[str] = None,
+    ):
+        """
+        Save a model and its configuration file to a directory, so that it can be re-loaded using the
+        `[`~models.adapter.MultiAdapter.from_pretrained`]` class method.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to which to save. Will be created if it doesn't exist.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful when in distributed training like
+                TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
+                the main process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
+                need to replace `torch.save` by another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+            variant (`str`, *optional*):
+                If specified, weights are saved in the format pytorch_model.<variant>.bin.
+        """
+        idx = 0
+        model_path_to_save = save_directory
+        for adapter in self.adapters:
+            adapter.save_pretrained(
+                model_path_to_save,
+                is_main_process=is_main_process,
+                save_function=save_function,
+                safe_serialization=safe_serialization,
+                variant=variant,
+            )
+
+            idx += 1
+            model_path_to_save = model_path_to_save + f"_{idx}"
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs):
+        r"""
+        Instantiate a pretrained MultiAdapter model from multiple pre-trained adapter models.
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
+        the model, you should first set it back in training mode with `model.train()`.
+
+        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
+        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
+        task.
+
+        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
+        weights are discarded.
+
+        Parameters:
+            pretrained_model_path (`os.PathLike`):
+                A path to a *directory* containing model weights saved using
+                [`~diffusers.models.adapter.MultiAdapter.save_pretrained`], e.g., `./my_model_directory/adapter`.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
+                will be automatically derived from the model's weights.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn't need to be refined to each
+                parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
+                same device.
+
+                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            max_memory (`Dict`, *optional*):
+                A dictionary device identifier to maximum memory. Will default to the maximum memory available for each
+                GPU and the available CPU RAM if unset.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
+                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
+                model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
+                setting this argument to `True` will raise an error.
+            variant (`str`, *optional*):
+                If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin. `variant` is
+                ignored when using `from_flax`.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the `safetensors` weights will be downloaded if they're available **and** if the
+                `safetensors` library is installed. If set to `True`, the model will be forcibly loaded from
+                `safetensors` weights. If set to `False`, loading will *not* use `safetensors`.
+        """
+        idx = 0
+        adapters = []
+
+        # load adapter and append to list until no adapter directory exists anymore
+        # first adapter has to be saved under `./mydirectory/adapter` to be compliant with `DiffusionPipeline.from_pretrained`
+        # second, third, ... adapters have to be saved under `./mydirectory/adapter_1`, `./mydirectory/adapter_2`, ...
+        model_path_to_load = pretrained_model_path
+        while os.path.isdir(model_path_to_load):
+            adapter = T2IAdapter.from_pretrained(model_path_to_load, **kwargs)
+            adapters.append(adapter)
+
+            idx += 1
+            model_path_to_load = pretrained_model_path + f"_{idx}"
+
+        logger.info(f"{len(adapters)} adapters loaded from {pretrained_model_path}.")
+
+        if len(adapters) == 0:
+            raise ValueError(
+                f"No T2IAdapters found under {os.path.dirname(pretrained_model_path)}. Expected at least {pretrained_model_path + '_0'}."
+            )
+
+        return cls(adapters)
+
 
 class T2IAdapter(ModelMixin, ConfigMixin):
     r"""
@@ -79,8 +222,10 @@ class T2IAdapter(ModelMixin, ConfigMixin):
     [Adapter](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L97)
      and
      [AdapterLight](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L235).
+
     This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
     implements for all the model (such as downloading or saving, etc.)
+
     Parameters:
         in_channels (`int`, *optional*, defaults to 3):
             Number of channels of Aapter's input(*control image*). Set this parameter to 1 if you're using gray scale
@@ -89,7 +234,11 @@ class T2IAdapter(ModelMixin, ConfigMixin):
             The number of channel of each downsample block's output hidden state. The `len(block_out_channels)` will
             also determine the number of downsample blocks in the Adapter.
         num_res_blocks (`int`, *optional*, defaults to 2):
-            Number of ResNet blocks in each downsample block
+            Number of ResNet blocks in each downsample block.
+        downscale_factor (`int`, *optional*, defaults to 8):
+            A factor that determines the total downscale factor of the Adapter.
+        adapter_type (`str`, *optional*, defaults to `full_adapter`):
+            The type of Adapter to use. Choose either `full_adapter` or `full_adapter_xl` or `light_adapter`.
     """
 
     @register_to_config
@@ -105,23 +254,45 @@ def __init__(
 
         if adapter_type == "full_adapter":
             self.adapter = FullAdapter(in_channels, channels, num_res_blocks, downscale_factor)
+        elif adapter_type == "full_adapter_xl":
+            self.adapter = FullAdapterXL(in_channels, channels, num_res_blocks, downscale_factor)
         elif adapter_type == "light_adapter":
             self.adapter = LightAdapter(in_channels, channels, num_res_blocks, downscale_factor)
         else:
-            raise ValueError(f"unknown adapter_type: {type}. Choose either 'full_adapter' or 'simple_adapter'")
+            raise ValueError(
+                f"Unsupported adapter_type: '{adapter_type}'. Choose either 'full_adapter' or "
+                "'full_adapter_xl' or 'light_adapter'."
+            )
 
     def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
+        r"""
+        This function processes the input tensor `x` through the adapter model and returns a list of feature tensors,
+        each representing information extracted at a different scale from the input. The length of the list is
+        determined by the number of downsample blocks in the Adapter, as specified by the `channels` and
+        `num_res_blocks` parameters during initialization.
+        """
         return self.adapter(x)
 
     @property
     def total_downscale_factor(self):
         return self.adapter.total_downscale_factor
 
+    @property
+    def downscale_factor(self):
+        """The downscale factor applied in the T2I-Adapter's initial pixel unshuffle operation. If an input image's dimensions are
+        not evenly divisible by the downscale_factor then an exception will be raised.
+        """
+        return self.adapter.unshuffle.downscale_factor
+
 
 # full adapter
 
 
 class FullAdapter(nn.Layer):
+    r"""
+    See [`T2IAdapter`] for more information.
+    """
+
     def __init__(
         self,
         in_channels: int = 3,
@@ -149,6 +320,62 @@ def __init__(
         self.total_downscale_factor = downscale_factor * 2 ** (len(channels) - 1)
 
     def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
+        r"""
+        This method processes the input tensor `x` through the FullAdapter model and performs operations including
+        pixel unshuffling, convolution, and a stack of AdapterBlocks. It returns a list of feature tensors, each
+        capturing information at a different stage of processing within the FullAdapter model. The number of feature
+        tensors in the list is determined by the number of downsample blocks specified during initialization.
+        """
+        x = self.unshuffle(x)
+        x = self.conv_in(x)
+
+        features = []
+
+        for block in self.body:
+            x = block(x)
+            features.append(x)
+
+        return features
+
+
+class FullAdapterXL(nn.Layer):
+    r"""
+    See [`T2IAdapter`] for more information.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        channels: List[int] = [320, 640, 1280, 1280],
+        num_res_blocks: int = 2,
+        downscale_factor: int = 16,
+    ):
+        super().__init__()
+
+        in_channels = in_channels * downscale_factor**2
+
+        self.unshuffle = nn.PixelUnshuffle(downscale_factor)
+        self.conv_in = nn.Conv2D(in_channels, channels[0], kernel_size=3, padding=1)
+
+        self.body = []
+        # blocks to extract XL features with dimensions of [320, 64, 64], [640, 64, 64], [1280, 32, 32], [1280, 32, 32]
+        for i in range(len(channels)):
+            if i == 1:
+                self.body.append(AdapterBlock(channels[i - 1], channels[i], num_res_blocks))
+            elif i == 2:
+                self.body.append(AdapterBlock(channels[i - 1], channels[i], num_res_blocks, down=True))
+            else:
+                self.body.append(AdapterBlock(channels[i], channels[i], num_res_blocks))
+
+        self.body = nn.ModuleList(self.body)
+        # XL has only one downsampling AdapterBlock.
+        self.total_downscale_factor = downscale_factor * 2
+
+    def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
+        r"""
+        This method takes the tensor x as input and processes it through FullAdapterXL model. It consists of operations
+        including unshuffling pixels, applying convolution layer and appending each block into list of feature tensors.
+        """
         x = self.unshuffle(x)
         x = self.conv_in(x)
 
@@ -162,12 +389,27 @@ def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
 
 
 class AdapterBlock(nn.Layer):
-    def __init__(self, in_channels, out_channels, num_res_blocks, down=False):
+    r"""
+    An AdapterBlock is a helper model that contains multiple ResNet-like blocks. It is used in the `FullAdapter` and
+    `FullAdapterXL` models.
+
+    Parameters:
+        in_channels (`int`):
+            Number of channels of AdapterBlock's input.
+        out_channels (`int`):
+            Number of channels of AdapterBlock's output.
+        num_res_blocks (`int`):
+            Number of ResNet blocks in the AdapterBlock.
+        down (`bool`, *optional*, defaults to `False`):
+            Whether to perform downsampling on AdapterBlock's input.
+    """
+
+    def __init__(self, in_channels: int, out_channels: int, num_res_blocks: int, down: bool = False):
         super().__init__()
 
         self.downsample = None
         if down:
-            self.downsample = Downsample2D(in_channels)
+            self.downsample = nn.AvgPool2D(kernel_size=2, stride=2, ceil_mode=True)
 
         self.in_conv = None
         if in_channels != out_channels:
@@ -177,7 +419,12 @@ def __init__(self, in_channels, out_channels, num_res_blocks, down=False):
             *[AdapterResnetBlock(out_channels) for _ in range(num_res_blocks)],
         )
 
-    def forward(self, x):
+    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        r"""
+        This method takes tensor x as input and performs operations downsampling and convolutional layers if the
+        self.downsample and self.in_conv properties of AdapterBlock model are specified. Then it applies a series of
+        residual blocks to the input tensor.
+        """
         if self.downsample is not None:
             x = self.downsample(x)
 
@@ -190,6 +437,14 @@ def forward(self, x):
 
 
 class AdapterResnetBlock(nn.Layer):
+    r"""
+    An `AdapterResnetBlock` is a helper model that implements a ResNet-like block.
+
+    Parameters:
+        channels (`int`):
+            Number of channels of AdapterResnetBlock's input and output.
+    """
+
     def __init__(self, channels):
         super().__init__()
         self.block1 = nn.Conv2D(channels, channels, kernel_size=3, padding=1)
@@ -197,9 +452,12 @@ def __init__(self, channels):
         self.block2 = nn.Conv2D(channels, channels, kernel_size=1)
 
     def forward(self, x):
-        h = x
-        h = self.block1(h)
-        h = self.act(h)
+        r"""
+        This method takes input tensor x and applies a convolutional layer, ReLU activation, and another convolutional
+        layer on the input tensor. It returns addition with the input tensor.
+        """
+
+        h = self.act(self.block1(x))
         h = self.block2(h)
 
         return h + x
@@ -209,6 +467,10 @@ def forward(self, x):
 
 
 class LightAdapter(nn.Layer):
+    r"""
+    See [`T2IAdapter`] for more information.
+    """
+
     def __init__(
         self,
         in_channels: int = 3,
@@ -235,7 +497,11 @@ def __init__(
 
         self.total_downscale_factor = downscale_factor * (2 ** len(channels))
 
-    def forward(self, x):
+    def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
+        r"""
+        This method takes the input tensor x and performs downscaling and appends it in list of feature tensors. Each
+        feature tensor corresponds to a different level of processing within the LightAdapter.
+        """
         x = self.unshuffle(x)
 
         features = []
@@ -248,19 +514,38 @@ def forward(self, x):
 
 
 class LightAdapterBlock(nn.Layer):
+    r"""
+    A `LightAdapterBlock` is a helper model that contains multiple `LightAdapterResnetBlocks`. It is used in the
+    `LightAdapter` model.
+
+    Parameters:
+        in_channels (`int`):
+            Number of channels of LightAdapterBlock's input.
+        out_channels (`int`):
+            Number of channels of LightAdapterBlock's output.
+        num_res_blocks (`int`):
+            Number of LightAdapterResnetBlocks in the LightAdapterBlock.
+        down (`bool`, *optional*, defaults to `False`):
+            Whether to perform downsampling on LightAdapterBlock's input.
+    """
+
     def __init__(self, in_channels, out_channels, num_res_blocks, down=False):
         super().__init__()
         mid_channels = out_channels // 4
 
         self.downsample = None
         if down:
-            self.downsample = Downsample2D(in_channels)
+            self.downsample = nn.AvgPool2D(kernel_size=2, stride=2, ceil_mode=True)
 
         self.in_conv = nn.Conv2D(in_channels, mid_channels, kernel_size=1)
         self.resnets = nn.Sequential(*[LightAdapterResnetBlock(mid_channels) for _ in range(num_res_blocks)])
         self.out_conv = nn.Conv2D(mid_channels, out_channels, kernel_size=1)
 
-    def forward(self, x):
+    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        r"""
+        This method takes tensor x as input and performs downsampling if required. Then it applies in convolution
+        layer, a sequence of residual blocks, and out convolutional layer.
+        """
         if self.downsample is not None:
             x = self.downsample(x)
 
@@ -272,16 +557,28 @@ def forward(self, x):
 
 
 class LightAdapterResnetBlock(nn.Layer):
+    """
+    A `LightAdapterResnetBlock` is a helper model that implements a ResNet-like block with a slightly different
+    architecture than `AdapterResnetBlock`.
+
+    Parameters:
+        channels (`int`):
+            Number of channels of LightAdapterResnetBlock's input and output.
+    """
+
     def __init__(self, channels):
         super().__init__()
         self.block1 = nn.Conv2D(channels, channels, kernel_size=3, padding=1)
         self.act = nn.ReLU()
         self.block2 = nn.Conv2D(channels, channels, kernel_size=3, padding=1)
 
-    def forward(self, x):
-        h = x
-        h = self.block1(h)
-        h = self.act(h)
+    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        r"""
+        This function takes input tensor x and processes it through one convolutional layer, ReLU activation, and
+        another convolutional layer and adds it to input tensor.
+        """
+
+        h = self.act(self.block1(x))
         h = self.block2(h)
 
         return h + x
diff --git a/ppdiffusers/ppdiffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/ppdiffusers/ppdiffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
index d6aa74e1e..573e9c2bf 100644
--- a/ppdiffusers/ppdiffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
+++ b/ppdiffusers/ppdiffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
@@ -25,10 +25,12 @@
 from ...image_processor import VaeImageProcessor
 from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     PIL_INTERPOLATION,
     BaseOutput,
+    deprecate,
     logging,
     randn_tensor,
     replace_example_docstring,
@@ -89,9 +91,12 @@ def _preprocess_adapter_image(image, height, width):
         return image
     elif isinstance(image, PIL.Image.Image):
         image = [image]
+
     if isinstance(image[0], PIL.Image.Image):
         image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])) for i in image]
-        image = [(i[None, ..., None] if i.ndim == 2 else i[None, ...]) for i in image]
+        image = [
+            i[None, ..., None] if i.ndim == 2 else i[None, ...] for i in image
+        ]  # expand [h, w] or [h, w, c] to [b, h, w, c]
         image = np.concatenate(image, axis=0)
         image = np.array(image).astype(np.float32) / 255.0
         image = image.transpose(0, 3, 1, 2)
@@ -161,11 +166,17 @@ def __init__(
         super().__init__()
         if safety_checker is None and requires_safety_checker:
             logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
         if safety_checker is not None and feature_extractor is None:
             raise ValueError(
-                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
         if isinstance(adapter, (list, tuple)):
             adapter = MultiAdapter(adapter, adapter_weights=adapter_weights)
@@ -209,12 +220,44 @@ def _encode_prompt(
         prompt_embeds: Optional[paddle.Tensor] = None,
         negative_prompt_embeds: Optional[paddle.Tensor] = None,
         lora_scale: Optional[float] = None,
+        **kwargs,
+    ):
+        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
+
+        prompt_embeds_tuple = self.encode_prompt(
+            prompt=prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            negative_prompt=negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            **kwargs,
+        )
+
+        # concatenate for backwards comp
+        prompt_embeds = paddle.concat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
+
+        return prompt_embeds
+
+    # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[paddle.Tensor] = None,
+        negative_prompt_embeds: Optional[paddle.Tensor] = None,
+        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
     ):
         """
         Encodes the prompt into text encoder hidden states.
 
         Args:
-             prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded
             num_images_per_prompt (`int`):
                 number of images that should be generated per prompt
@@ -232,12 +275,19 @@ def _encode_prompt(
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
             lora_scale (`float`, *optional*):
-                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
         """
         # set lora scale so that monkey patched LoRA
         # function of text encoder can correctly access it
         if lora_scale is not None and isinstance(self, LoraLoaderMixin):
             self._lora_scale = lora_scale
+
+            # dynamically adjust the LoRA scale
+            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
+
         if prompt is not None and isinstance(prompt, str):
             batch_size = 1
         elif prompt is not None and isinstance(prompt, list):
@@ -265,15 +315,39 @@ def _encode_prompt(
                     untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                 )
                 logger.warning(
-                    f"The following part of your input was truncated because CLIP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}"
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                 )
             if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                 attention_mask = text_inputs.attention_mask
             else:
                 attention_mask = None
-            prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
-            prompt_embeds = prompt_embeds[0]
-        prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype)
+
+            if clip_skip is None:
+                prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
+                prompt_embeds = prompt_embeds[0]
+            else:
+                prompt_embeds = self.text_encoder(
+                    text_input_ids, attention_mask=attention_mask, output_hidden_states=True
+                )
+                # Access the `hidden_states` first, that contains a tuple of
+                # all the hidden states from the encoder layers. Then index into
+                # the tuple to access the hidden states from the desired layer.
+                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
+                # We also need to apply the final LayerNorm here to not mess with the
+                # representations. The `last_hidden_states` that we typically use for
+                # obtaining the final prompt representations passes through the LayerNorm
+                # layer.
+                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
+
+        if self.text_encoder is not None:
+            prompt_embeds_dtype = self.text_encoder.dtype
+        elif self.unet is not None:
+            prompt_embeds_dtype = self.unet.dtype
+        else:
+            prompt_embeds_dtype = prompt_embeds.dtype
+
+        prompt_embeds = prompt_embeds.cast(dtype=prompt_embeds_dtype)
         bs_embed, seq_len, _ = prompt_embeds.shape
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1])
@@ -286,13 +360,16 @@ def _encode_prompt(
                 uncond_tokens = [""] * batch_size
             elif prompt is not None and type(prompt) is not type(negative_prompt):
                 raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} != {type(prompt)}."
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
                 )
             elif isinstance(negative_prompt, str):
                 uncond_tokens = [negative_prompt]
             elif batch_size != len(negative_prompt):
                 raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`: {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches the batch size of `prompt`."
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
                 )
             else:
                 uncond_tokens = negative_prompt
@@ -300,6 +377,7 @@ def _encode_prompt(
             # textual inversion: procecss multi-vector tokens if necessary
             if isinstance(self, TextualInversionLoaderMixin):
                 uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
             max_length = prompt_embeds.shape[1]
             uncond_input = self.tokenizer(
                 uncond_tokens, padding="max_length", max_length=max_length, truncation=True, return_tensors="pd"
@@ -317,11 +395,7 @@ def _encode_prompt(
             negative_prompt_embeds = negative_prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1])
             negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
 
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds])
-        return prompt_embeds
+        return prompt_embeds, negative_prompt_embeds
 
     # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
     def run_safety_checker(self, image, dtype):
@@ -414,7 +488,8 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
         shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators."
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )
         if latents is None:
             latents = randn_tensor(shape, generator=generator, dtype=dtype)
@@ -447,6 +522,34 @@ def _default_height_width(self, height, width, image):
             width = width // self.adapter.total_downscale_factor * self.adapter.total_downscale_factor
         return height, width
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
+    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stages where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
+        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        if not hasattr(self, "unet"):
+            raise ValueError("The pipeline must have `unet` for using FreeU.")
+        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
+    def disable_freeu(self):
+        """Disables the FreeU mechanism if enabled."""
+        self.unet.disable_freeu()
+
     @paddle.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
@@ -470,6 +573,7 @@ def __call__(
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         adapter_conditioning_scale: Union[float, List[float]] = 1.0,
+        clip_skip: Optional[int] = None,
     ):
         """
         Function invoked when calling the pipeline for generation.
@@ -538,7 +642,9 @@ def __call__(
                 The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the
                 residual in the original unet. If multiple adapters are specified in init, you can set the
                 corresponding scale as a list.
-
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
         Examples:
 
         Returns:
@@ -555,14 +661,26 @@ def __call__(
         self.check_inputs(
             prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
         )
-        is_multi_adapter = isinstance(self.adapter, MultiAdapter)
-        if is_multi_adapter:
-            adapter_input = [_preprocess_adapter_image(img, height, width) for img in image]
-            n, c, h, w = adapter_input[0].shape
-            adapter_input = paddle.stack(x=[x.reshape([n * c, h, w]) for x in adapter_input])
+
+        # is_multi_adapter = isinstance(self.adapter, MultiAdapter)
+        # if is_multi_adapter:
+        #     adapter_input = [_preprocess_adapter_image(img, height, width) for img in image]
+        #     n, c, h, w = adapter_input[0].shape
+        #     adapter_input = paddle.stack(x=[x.reshape([n * c, h, w]) for x in adapter_input])
+        # else:
+        #     adapter_input = _preprocess_adapter_image(image, height, width)
+        # adapter_input = adapter_input.cast(self.adapter.dtype)
+
+        if isinstance(self.adapter, MultiAdapter):
+            adapter_input = []
+
+            for one_image in image:
+                one_image = _preprocess_adapter_image(one_image, height, width)
+                one_image = one_image.cast(dtype=self.adapter.dtype)
+                adapter_input.append(one_image)
         else:
             adapter_input = _preprocess_adapter_image(image, height, width)
-        adapter_input = adapter_input.cast(self.adapter.dtype)
+            adapter_input = adapter_input.cast(dtype=self.adapter.dtype)
 
         # 2. Define call parameters
         if prompt is not None and isinstance(prompt, str):
@@ -578,14 +696,20 @@ def __call__(
         do_classifier_free_guidance = guidance_scale > 1.0
 
         # 3. Encode input prompt
-        prompt_embeds = self._encode_prompt(
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
             prompt,
             num_images_per_prompt,
             do_classifier_free_guidance,
             negative_prompt,
             prompt_embeds=prompt_embeds,
             negative_prompt_embeds=negative_prompt_embeds,
+            clip_skip=clip_skip,
         )
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if do_classifier_free_guidance:
+            prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
 
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps)
@@ -607,15 +731,21 @@ def __call__(
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
         # 7. Denoising loop
-        adapter_state = self.adapter(adapter_input)
-        for k, v in enumerate(adapter_state):
-            adapter_state[k] = v * adapter_conditioning_scale
+        if isinstance(self.adapter, MultiAdapter):
+            adapter_state = self.adapter(adapter_input, adapter_conditioning_scale)
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v
+        else:
+            adapter_state = self.adapter(adapter_input)
+            for k, v in enumerate(adapter_state):
+                adapter_state[k] = v * adapter_conditioning_scale
         if num_images_per_prompt > 1:
             for k, v in enumerate(adapter_state):
                 adapter_state[k] = v.tile(repeat_times=[num_images_per_prompt, 1, 1, 1])
         if do_classifier_free_guidance:
             for k, v in enumerate(adapter_state):
                 adapter_state[k] = paddle.concat(x=[v] * 2, axis=0)
+
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
@@ -629,8 +759,9 @@ def __call__(
                     t,
                     encoder_hidden_states=prompt_embeds,
                     cross_attention_kwargs=cross_attention_kwargs,
-                    down_block_additional_residuals=[state.clone() for state in adapter_state],
-                ).sample
+                    down_intrablock_additional_residuals=[state.clone() for state in adapter_state],
+                    return_dict=False,
+                )[0]
 
                 # perform guidance
                 if do_classifier_free_guidance:
@@ -641,10 +772,12 @@ def __call__(
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
 
                 # call the callback, if provided
-                if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
                     if callback is not None and i % callback_steps == 0:
-                        callback(i, t, latents)
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+
         if output_type == "latent":
             image = latents
             has_nsfw_concept = None
@@ -665,5 +798,6 @@ def __call__(
             image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
 
         if not return_dict:
-            return image, has_nsfw_concept
+            return (image, has_nsfw_concept)
+
         return StableDiffusionAdapterPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

From f9eebb1501ec686d53f3253437422dd9036b5be4 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Sat, 2 Dec 2023 20:44:38 +0800
Subject: [PATCH 06/19] Fix

---
 ppdiffusers/ppdiffusers/models/unet_2d_condition.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
index 041106de9..274d8d32b 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
@@ -1096,10 +1096,13 @@ def forward(
                 #     sample += down_block_additional_residuals.pop(0)
                 #     # westfish: add to align with torch features
                 #     res_samples = tuple(res_samples[:-1]) + (sample,)
+
                 sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
                 if is_adapter and len(down_intrablock_additional_residuals) > 0:
                     sample += down_intrablock_additional_residuals.pop(0)
-
+                    # pytorch without the following line:
+                    # westfish: add to align with torch features
+                    res_samples = tuple(res_samples[:-1]) + (sample,)
             down_block_res_samples += res_samples
 
         if is_controlnet:

From 22f3743ab7a4af9fb501979a00fbc0adb84e9feb Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Tue, 5 Dec 2023 09:45:04 +0800
Subject: [PATCH 07/19] Fix

---
 .../{test_lora_layers.py => test_lora_layers_old_backend.py}  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename ppdiffusers/tests/lora/{test_lora_layers.py => test_lora_layers_old_backend.py} (99%)

diff --git a/ppdiffusers/tests/lora/test_lora_layers.py b/ppdiffusers/tests/lora/test_lora_layers_old_backend.py
similarity index 99%
rename from ppdiffusers/tests/lora/test_lora_layers.py
rename to ppdiffusers/tests/lora/test_lora_layers_old_backend.py
index 53fa7687b..9ee791202 100644
--- a/ppdiffusers/tests/lora/test_lora_layers.py
+++ b/ppdiffusers/tests/lora/test_lora_layers_old_backend.py
@@ -899,8 +899,8 @@ def test_lora_fusion_is_not_affected_by_unloading(self):
         images_with_unloaded_lora = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
         images_with_unloaded_lora_slice = images_with_unloaded_lora[0, -3:, -3:, -1]
 
-        assert np.allclose(
-            lora_image_slice, images_with_unloaded_lora_slice, atol=0.05
+        assert (
+            np.abs(lora_image_slice - images_with_unloaded_lora_slice).max() < 2e-1  # diffusers 0.23
         ), "`unload_lora_weights()` should have not effect on the semantics of the results as the LoRA parameters were fused."
 
     def test_fuse_lora_with_different_scales(self):

From 02854da1cd101c56e32b69a0b69bb92da30c86b2 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Tue, 5 Dec 2023 12:38:16 +0800
Subject: [PATCH 08/19] Fix

---
 ppdiffusers/ppdiffusers/models/unet_2d_blocks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
index e4e5e1ce0..5b302e53b 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
@@ -1193,7 +1193,7 @@ def custom_forward(*inputs):
                     encoder_attention_mask,
                 )  # [0]
             else:
-                hidden_states = resnet(hidden_states, temb)
+                hidden_states = resnet(hidden_states, temb, scale=lora_scale)
                 hidden_states = attn(
                     hidden_states,
                     encoder_hidden_states=encoder_hidden_states,

From 2bea09acc2d3394f0404c48eed708dc0753b80d4 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Tue, 5 Dec 2023 14:04:31 +0800
Subject: [PATCH 09/19] Fix

---
 ppdiffusers/ppdiffusers/utils/paddle_utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ppdiffusers/ppdiffusers/utils/paddle_utils.py b/ppdiffusers/ppdiffusers/utils/paddle_utils.py
index 06a162025..b626db689 100644
--- a/ppdiffusers/ppdiffusers/utils/paddle_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/paddle_utils.py
@@ -243,19 +243,19 @@ def fourier_filter(x_in, threshold, scale):
         x = x.to(dtype=paddle.float32)
 
     # FFT
-    x_freq = fftn(x, dim=(-2, -1))
-    x_freq = fftshift(x_freq, dim=(-2, -1))
+    x_freq = fftn(x, axes=(-2, -1))
+    x_freq = fftshift(x_freq, axes=(-2, -1))
 
     B, C, H, W = x_freq.shape
-    mask = paddle.ones((B, C, H, W), device=x.device)
+    mask = paddle.ones((B, C, H, W))
 
     crow, ccol = H // 2, W // 2
     mask[..., crow - threshold : crow + threshold, ccol - threshold : ccol + threshold] = scale
     x_freq = x_freq * mask
 
     # IFFT
-    x_freq = ifftshift(x_freq, dim=(-2, -1))
-    x_filtered = ifftn(x_freq, dim=(-2, -1)).real
+    x_freq = ifftshift(x_freq, axes=(-2, -1))
+    x_filtered = ifftn(x_freq, axes=(-2, -1)).real
 
     return x_filtered.to(dtype=x_in.dtype)
 

From ca566e9840722f7e55f313d0f581db96ffe328ce Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Tue, 5 Dec 2023 15:25:56 +0800
Subject: [PATCH 10/19] Fix

---
 ppdiffusers/ppdiffusers/models/lora.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ppdiffusers/ppdiffusers/models/lora.py b/ppdiffusers/ppdiffusers/models/lora.py
index 45c5d4cb2..09b47192d 100644
--- a/ppdiffusers/ppdiffusers/models/lora.py
+++ b/ppdiffusers/ppdiffusers/models/lora.py
@@ -236,7 +236,7 @@ def forward(self, hidden_states, scale: float = 1.0):
             )
         else:
             original_outputs = F.conv2d(
-                hidden_states, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
+                hidden_states, self.weight, self.bias, self._stride, self._padding, self._dilation, self._groups
             )
             return original_outputs + (scale * self.lora_layer(hidden_states))
 

From 6891a2bfb29cb86e4f06e56f3a04aa9ebb693e1d Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Thu, 7 Dec 2023 09:42:46 +0800
Subject: [PATCH 11/19] Fix

---
 ppdiffusers/ppdiffusers/__init__.py           |   2 +
 ppdiffusers/ppdiffusers/models/__init__.py    |   1 +
 .../ppdiffusers/models/autoencoder_kl.py      |  74 +-
 .../ppdiffusers/models/autoencoder_tiny.py    | 347 +++++++
 ppdiffusers/ppdiffusers/models/controlnet.py  |  71 +-
 .../ppdiffusers/models/dual_transformer_2d.py |   4 +
 ppdiffusers/ppdiffusers/models/embeddings.py  |  10 +-
 .../ppdiffusers/models/t5_film_transformer.py | 137 ++-
 .../models/transformer_temporal.py            |  52 +-
 .../ppdiffusers/models/unet_motion_model.py   | 874 ++++++++++++++++++
 ppdiffusers/ppdiffusers/models/vae.py         | 316 ++++++-
 ppdiffusers/ppdiffusers/models/vq_model.py    |  24 +-
 .../tests/models/test_models_unet_motion.py   | 299 ++++++
 13 files changed, 2108 insertions(+), 103 deletions(-)
 create mode 100644 ppdiffusers/ppdiffusers/models/autoencoder_tiny.py
 create mode 100644 ppdiffusers/ppdiffusers/models/unet_motion_model.py
 create mode 100644 ppdiffusers/tests/models/test_models_unet_motion.py

diff --git a/ppdiffusers/ppdiffusers/__init__.py b/ppdiffusers/ppdiffusers/__init__.py
index 616a2f56d..17682f6f9 100644
--- a/ppdiffusers/ppdiffusers/__init__.py
+++ b/ppdiffusers/ppdiffusers/__init__.py
@@ -63,6 +63,7 @@
         LVDMAutoencoderKL,
         LVDMUNet3DModel,
         ModelMixin,
+        MotionAdapter,
         MultiAdapter,
         PriorTransformer,
         T2IAdapter,
@@ -72,6 +73,7 @@
         UNet2DConditionModel,
         UNet2DModel,
         UNet3DConditionModel,
+        UNetMotionModel,
         VQModel,
     )
     from .optimization import (
diff --git a/ppdiffusers/ppdiffusers/models/__init__.py b/ppdiffusers/ppdiffusers/models/__init__.py
index 08192f41d..431b806c9 100644
--- a/ppdiffusers/ppdiffusers/models/__init__.py
+++ b/ppdiffusers/ppdiffusers/models/__init__.py
@@ -37,6 +37,7 @@
     from .unet_2d import UNet2DModel
     from .unet_2d_condition import UNet2DConditionModel
     from .unet_3d_condition import UNet3DConditionModel
+    from .unet_motion_model import MotionAdapter, UNetMotionModel
     from .vq_model import VQModel
 
 try:
diff --git a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py
index 9a0e4eb87..bf1078ff1 100644
--- a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py
+++ b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py
@@ -21,7 +21,13 @@
 from ..configuration_utils import ConfigMixin, register_to_config
 from ..loaders import FromOriginalVAEMixin
 from ..utils import BaseOutput, apply_forward_hook
-from .attention_processor import AttentionProcessor, AttnProcessor
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
 from .modeling_utils import ModelMixin
 from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
 
@@ -183,8 +189,8 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
         processors = {}
 
         def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "set_processor"):
-                processors[f"{name}.processor"] = module.processor
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
 
             for sub_name, child in module.named_children():
                 fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -197,15 +203,20 @@ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[st
         return processors
 
     # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
         r"""
         Sets the attention processor to use to compute attention.
+
         Parameters:
             processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                 The instantiated processor class or a dictionary of processor classes that will be set as the processor
                 for **all** `Attention` layers.
+
                 If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                 processor. This is strongly recommended when setting trainable attention processors.
+
         """
         count = len(self.attn_processors.keys())
 
@@ -218,9 +229,9 @@ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, Atte
         def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
             if hasattr(module, "set_processor"):
                 if not isinstance(processor, dict):
-                    module.set_processor(processor)
+                    module.set_processor(processor, _remove_lora=_remove_lora)
                 else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
 
             for sub_name, child in module.named_children():
                 fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
@@ -233,10 +244,33 @@ def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.
         """
-        self.set_attn_processor(AttnProcessor())
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor, _remove_lora=True)
 
     @apply_forward_hook
-    def encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
+    def encode(
+        self, x: paddle.Tensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        """
+        Encode a batch of images into latents.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+                The latent representations of the encoded images. If `return_dict` is True, a
+                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+        """
         # TODO junnyu, support float16
         x = x.cast(self.encoder.conv_in.weight.dtype)
         if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
@@ -247,6 +281,7 @@ def encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOut
             h = paddle.concat(encoded_slices)
         else:
             h = self.encoder(x)
+
         moments = self.quant_conv(h)
         posterior = DiagonalGaussianDistribution(moments)
 
@@ -268,7 +303,23 @@ def _decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOu
         return DecoderOutput(sample=dec)
 
     @apply_forward_hook
-    def decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
+    def decode(
+        self, z: paddle.Tensor, return_dict: bool = True, generator=None
+    ) -> Union[DecoderOutput, paddle.Tensor]:
+        """
+        Decode a batch of images.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+
+        """
         # TODO junnyu, add this to support pure fp16
         z = z.cast(self.post_quant_conv.weight.dtype)
         if self.use_slicing and z.shape[0] > 1:
@@ -297,15 +348,18 @@ def blend_h(self, a, b, blend_extent):
 
     def tiled_encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
         r"""Encode a batch of images using a tiled encoder.
+
         When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
         steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
         different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
         tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
         output, but they should be much less noticeable.
+
         Args:
             x (`paddle.Tensor`): Input batch of images.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
         Returns:
             [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
                 If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
@@ -349,10 +403,12 @@ def tiled_encode(self, x: paddle.Tensor, return_dict: bool = True) -> Autoencode
     def tiled_decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
         r"""
         Decode a batch of images using a tiled decoder.
+
         Args:
             z (`paddle.Tensor`): Input batch of latent vectors.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
         Returns:
             [`~models.vae.DecoderOutput`] or `tuple`:
                 If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
diff --git a/ppdiffusers/ppdiffusers/models/autoencoder_tiny.py b/ppdiffusers/ppdiffusers/models/autoencoder_tiny.py
new file mode 100644
index 000000000..e16464b99
--- /dev/null
+++ b/ppdiffusers/ppdiffusers/models/autoencoder_tiny.py
@@ -0,0 +1,347 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2023 Ollin Boer Bohan and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import paddle
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .modeling_utils import ModelMixin
+from .vae import DecoderOutput, DecoderTiny, EncoderTiny
+
+
+@dataclass
+class AutoencoderTinyOutput(BaseOutput):
+    """
+    Output of AutoencoderTiny encoding method.
+
+    Args:
+        latents (`paddle.Tensor`): Encoded outputs of the `Encoder`.
+
+    """
+
+    latents: paddle.Tensor
+
+
+class AutoencoderTiny(ModelMixin, ConfigMixin):
+    r"""
+    A tiny distilled VAE model for encoding images into latents and decoding latent representations into images.
+
+    [`AutoencoderTiny`] is a wrapper around the original implementation of `TAESD`.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented for
+    all models (such as downloading or saving).
+
+    Parameters:
+        in_channels (`int`, *optional*, defaults to 3): Number of channels in the input image.
+        out_channels (`int`,  *optional*, defaults to 3): Number of channels in the output.
+        encoder_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64, 64, 64, 64)`):
+            Tuple of integers representing the number of output channels for each encoder block. The length of the
+            tuple should be equal to the number of encoder blocks.
+        decoder_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64, 64, 64, 64)`):
+            Tuple of integers representing the number of output channels for each decoder block. The length of the
+            tuple should be equal to the number of decoder blocks.
+        act_fn (`str`, *optional*, defaults to `"relu"`):
+            Activation function to be used throughout the model.
+        latent_channels (`int`, *optional*, defaults to 4):
+            Number of channels in the latent representation. The latent space acts as a compressed representation of
+            the input image.
+        upsampling_scaling_factor (`int`, *optional*, defaults to 2):
+            Scaling factor for upsampling in the decoder. It determines the size of the output image during the
+            upsampling process.
+        num_encoder_blocks (`Tuple[int]`, *optional*, defaults to `(1, 3, 3, 3)`):
+            Tuple of integers representing the number of encoder blocks at each stage of the encoding process. The
+            length of the tuple should be equal to the number of stages in the encoder. Each stage has a different
+            number of encoder blocks.
+        num_decoder_blocks (`Tuple[int]`, *optional*, defaults to `(3, 3, 3, 1)`):
+            Tuple of integers representing the number of decoder blocks at each stage of the decoding process. The
+            length of the tuple should be equal to the number of stages in the decoder. Each stage has a different
+            number of decoder blocks.
+        latent_magnitude (`float`, *optional*, defaults to 3.0):
+            Magnitude of the latent representation. This parameter scales the latent representation values to control
+            the extent of information preservation.
+        latent_shift (float, *optional*, defaults to 0.5):
+            Shift applied to the latent representation. This parameter controls the center of the latent space.
+        scaling_factor (`float`, *optional*, defaults to 1.0):
+            The component-wise standard deviation of the trained latent space computed using the first batch of the
+            training set. This is used to scale the latent space to have unit variance when training the diffusion
+            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
+            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
+            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. For this Autoencoder,
+            however, no such scaling factor was used, hence the value of 1.0 as the default.
+        force_upcast (`bool`, *optional*, default to `False`):
+            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
+            can be fine-tuned / trained to a lower range without losing too much precision, in which case
+            `force_upcast` can be set to `False` (see this fp16-friendly
+            [AutoEncoder](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
+    """
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        in_channels=3,
+        out_channels=3,
+        encoder_block_out_channels: Tuple[int] = (64, 64, 64, 64),
+        decoder_block_out_channels: Tuple[int] = (64, 64, 64, 64),
+        act_fn: str = "relu",
+        latent_channels: int = 4,
+        upsampling_scaling_factor: int = 2,
+        num_encoder_blocks: Tuple[int] = (1, 3, 3, 3),
+        num_decoder_blocks: Tuple[int] = (3, 3, 3, 1),
+        latent_magnitude: int = 3,
+        latent_shift: float = 0.5,
+        force_upcast: float = False,
+        scaling_factor: float = 1.0,
+    ):
+        super().__init__()
+
+        if len(encoder_block_out_channels) != len(num_encoder_blocks):
+            raise ValueError("`encoder_block_out_channels` should have the same length as `num_encoder_blocks`.")
+        if len(decoder_block_out_channels) != len(num_decoder_blocks):
+            raise ValueError("`decoder_block_out_channels` should have the same length as `num_decoder_blocks`.")
+
+        self.encoder = EncoderTiny(
+            in_channels=in_channels,
+            out_channels=latent_channels,
+            num_blocks=num_encoder_blocks,
+            block_out_channels=encoder_block_out_channels,
+            act_fn=act_fn,
+        )
+
+        self.decoder = DecoderTiny(
+            in_channels=latent_channels,
+            out_channels=out_channels,
+            num_blocks=num_decoder_blocks,
+            block_out_channels=decoder_block_out_channels,
+            upsampling_scaling_factor=upsampling_scaling_factor,
+            act_fn=act_fn,
+        )
+
+        self.latent_magnitude = latent_magnitude
+        self.latent_shift = latent_shift
+        self.scaling_factor = scaling_factor
+
+        self.use_slicing = False
+        self.use_tiling = False
+
+        # only relevant if vae tiling is enabled
+        self.spatial_scale_factor = 2**out_channels
+        self.tile_overlap_factor = 0.125
+        self.tile_sample_min_size = 512
+        self.tile_latent_min_size = self.tile_sample_min_size // self.spatial_scale_factor
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (EncoderTiny, DecoderTiny)):
+            module.gradient_checkpointing = value
+
+    def scale_latents(self, x):
+        """raw latents -> [0, 1]"""
+        return x.div(2 * self.latent_magnitude).add(self.latent_shift).clamp(0, 1)
+
+    def unscale_latents(self, x):
+        """[0, 1] -> raw latents"""
+        return x.sub(self.latent_shift).mul(2 * self.latent_magnitude)
+
+    def enable_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.use_slicing = True
+
+    def disable_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_slicing = False
+
+    def enable_tiling(self, use_tiling: bool = True):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.use_tiling = use_tiling
+
+    def disable_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.enable_tiling(False)
+
+    def _tiled_encode(self, x: paddle.Tensor) -> paddle.Tensor:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output.
+
+        Args:
+            x (`paddle.Tensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.autoencoder_tiny.AutoencoderTinyOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.autoencoder_tiny.AutoencoderTinyOutput`] or `tuple`:
+                If return_dict is True, a [`~models.autoencoder_tiny.AutoencoderTinyOutput`] is returned, otherwise a
+                plain `tuple` is returned.
+        """
+        # scale of encoder output relative to input
+        sf = self.spatial_scale_factor
+        tile_size = self.tile_sample_min_size
+
+        # number of pixels to blend and to traverse between tile
+        blend_size = int(tile_size * self.tile_overlap_factor)
+        traverse_size = tile_size - blend_size
+
+        # tiles index (up/left)
+        ti = range(0, x.shape[-2], traverse_size)
+        tj = range(0, x.shape[-1], traverse_size)
+
+        # mask for blending
+        blend_masks = paddle.stack(
+            paddle.meshgrid([paddle.arange(tile_size / sf) / (blend_size / sf - 1)] * 2, indexing="ij")
+        )
+        blend_masks = blend_masks.clamp(0, 1)
+
+        # output array
+        out = paddle.zeros([x.shape[0], 4, x.shape[-2] // sf, x.shape[-1] // sf])
+        for i in ti:
+            for j in tj:
+                tile_in = x[..., i : i + tile_size, j : j + tile_size]
+                # tile result
+                tile_out = out[..., i // sf : (i + tile_size) // sf, j // sf : (j + tile_size) // sf]
+                tile = self.encoder(tile_in)
+                h, w = tile.shape[-2], tile.shape[-1]
+                # blend tile result into output
+                blend_mask_i = paddle.ones_like(blend_masks[0]) if i == 0 else blend_masks[0]
+                blend_mask_j = paddle.ones_like(blend_masks[1]) if j == 0 else blend_masks[1]
+                blend_mask = blend_mask_i * blend_mask_j
+                tile, blend_mask = tile[..., :h, :w], blend_mask[..., :h, :w]
+                # pytorch: tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out)
+                tile_out = blend_mask * tile + (1 - blend_mask) * tile_out
+        return out
+
+    def _tiled_decode(self, x: paddle.Tensor) -> paddle.Tensor:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output.
+
+        Args:
+            x (`paddle.Tensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.autoencoder_tiny.AutoencoderTinyOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        # scale of decoder output relative to input
+        sf = self.spatial_scale_factor
+        tile_size = self.tile_latent_min_size
+
+        # number of pixels to blend and to traverse between tiles
+        blend_size = int(tile_size * self.tile_overlap_factor)
+        traverse_size = tile_size - blend_size
+
+        # tiles index (up/left)
+        ti = range(0, x.shape[-2], traverse_size)
+        tj = range(0, x.shape[-1], traverse_size)
+
+        # mask for blending
+        blend_masks = paddle.stack(
+            paddle.meshgrid([paddle.arange(tile_size * sf) / (blend_size * sf - 1)] * 2, indexing="ij")
+        )
+        blend_masks = blend_masks.clamp(0, 1).to(x.device)
+
+        # output array
+        out = paddle.zeros(x.shape[0], 3, x.shape[-2] * sf, x.shape[-1] * sf, device=x.device)
+        for i in ti:
+            for j in tj:
+                tile_in = x[..., i : i + tile_size, j : j + tile_size]
+                # tile result
+                tile_out = out[..., i * sf : (i + tile_size) * sf, j * sf : (j + tile_size) * sf]
+                tile = self.decoder(tile_in)
+                h, w = tile.shape[-2], tile.shape[-1]
+                # blend tile result into output
+                blend_mask_i = paddle.ones_like(blend_masks[0]) if i == 0 else blend_masks[0]
+                blend_mask_j = paddle.ones_like(blend_masks[1]) if j == 0 else blend_masks[1]
+                blend_mask = (blend_mask_i * blend_mask_j)[..., :h, :w]
+                # pytorch: tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out)
+                tile_out = blend_mask * tile + (1 - blend_mask) * tile_out
+        return out
+
+    def encode(self, x: paddle.Tensor, return_dict: bool = True) -> Union[AutoencoderTinyOutput, Tuple[paddle.Tensor]]:
+        if self.use_slicing and x.shape[0] > 1:
+            output = [self._tiled_encode(x_slice) if self.use_tiling else self.encoder(x) for x_slice in x.split(1)]
+            output = paddle.concat(output)
+        else:
+            output = self._tiled_encode(x) if self.use_tiling else self.encoder(x)
+
+        if not return_dict:
+            return (output,)
+
+        return AutoencoderTinyOutput(latents=output)
+
+    def decode(
+        self, x: paddle.Tensor, generator: Optional[paddle.Generator] = None, return_dict: bool = True
+    ) -> Union[DecoderOutput, Tuple[paddle.Tensor]]:
+        if self.use_slicing and x.shape[0] > 1:
+            output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)]
+            output = paddle.concat(output)
+        else:
+            output = self._tiled_decode(x) if self.use_tiling else self.decoder(x)
+
+        if not return_dict:
+            return (output,)
+
+        return DecoderOutput(sample=output)
+
+    def forward(
+        self,
+        sample: paddle.Tensor,
+        return_dict: bool = True,
+    ) -> Union[DecoderOutput, Tuple[paddle.Tensor]]:
+        r"""
+        Args:
+            sample (`paddle.Tensor`): Input sample.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+        """
+        enc = self.encode(sample).latents
+
+        # scale latents to be in [0, 1], then quantize latents to a byte tensor,
+        # as if we were storing the latents in an RGBA uint8 image.
+        scaled_enc = self.scale_latents(enc).mul_(255).round_().byte()
+
+        # unquantize latents back into [0, 1], then unscale latents back to their original range,
+        # as if we were loading the latents from an RGBA uint8 image.
+        unscaled_enc = self.unscale_latents(scaled_enc / 255.0)
+
+        dec = self.decode(unscaled_enc)
+
+        if not return_dict:
+            return (dec,)
+        return DecoderOutput(sample=dec)
diff --git a/ppdiffusers/ppdiffusers/models/controlnet.py b/ppdiffusers/ppdiffusers/models/controlnet.py
index 091b4d5db..df9755fc2 100644
--- a/ppdiffusers/ppdiffusers/models/controlnet.py
+++ b/ppdiffusers/ppdiffusers/models/controlnet.py
@@ -23,7 +23,13 @@
 from ..initializer import zeros_
 from ..loaders import FromOriginalControlnetMixin
 from ..utils import BaseOutput, logging
-from .attention_processor import AttentionProcessor, AttnProcessor
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
 from .embeddings import (
     TextImageProjection,
     TextImageTimeEmbedding,
@@ -47,6 +53,7 @@
 class ControlNetOutput(BaseOutput):
     """
     The output of [`ControlNetModel`].
+
     Args:
         down_block_res_samples (`tuple[paddle.Tensor]`):
             A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
@@ -69,12 +76,12 @@ def forward(self, hidden_states):
 
 class ControlNetConditioningEmbedding(nn.Layer):
     """
-    "Stable Diffusion uses a pre-processing method similar to VQ-GAN [11] to convert the entire dataset of 512 × 512
-    images into smaller 64 × 64 “latent images” for stabilized training. This requires ControlNets to convert
-    image-based conditions to 64 × 64 feature space to match the convolution size. We use a tiny network E(·) of four
-    convolution layers with 4 × 4 kernels and 2 × 2 strides (activated by ReLU, channels are 16, 32, 64, 128,
-    initialized with Gaussian weights, trained jointly with the full model) to encode image-space conditions ... into
-    feature maps ..."
+    Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
+    [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
+    training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
+    convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
+    (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
+    model) to encode image-space conditions ... into feature maps ..."
     """
 
     def __init__(
@@ -115,6 +122,7 @@ def forward(self, conditioning):
 class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
     """
     A ControlNet model.
+
     Args:
         in_channels (`int`, defaults to 4):
             The number of channels in the input sample.
@@ -221,6 +229,7 @@ def __init__(
         resnet_pre_temb_non_linearity: bool = False,
     ):
         super().__init__()
+
         # If `num_attention_heads` is not defined (which is the case for most models)
         # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
         # The reason for this behavior is to correct for incorrectly named variables that were introduced
@@ -257,10 +266,8 @@ def __init__(
 
         # time
         time_embed_dim = block_out_channels[0] * 4
-
         self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
         timestep_input_dim = block_out_channels[0]
-
         self.time_embedding = TimestepEmbedding(
             timestep_input_dim,
             time_embed_dim,
@@ -451,6 +458,7 @@ def from_unet(
     ):
         r"""
         Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`].
+
         Parameters:
             unet (`UNet2DConditionModel`):
                 The UNet model weights to copy to the [`ControlNetModel`]. All configuration options are also copied
@@ -465,6 +473,7 @@ def from_unet(
         addition_time_embed_dim = (
             unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None
         )
+
         controlnet = cls(
             encoder_hid_dim=encoder_hid_dim,
             encoder_hid_dim_type=encoder_hid_dim_type,
@@ -511,6 +520,7 @@ def from_unet(
         return controlnet
 
     @property
+    # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
     def attn_processors(self) -> Dict[str, AttentionProcessor]:
         r"""
         Returns:
@@ -521,8 +531,8 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
         processors = {}
 
         def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "set_processor"):
-                processors[f"{name}.processor"] = module.processor
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
 
             for sub_name, child in module.named_children():
                 fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -534,15 +544,21 @@ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[st
 
         return processors
 
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+    # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
         r"""
         Sets the attention processor to use to compute attention.
+
         Parameters:
             processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                 The instantiated processor class or a dictionary of processor classes that will be set as the processor
                 for **all** `Attention` layers.
+
                 If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                 processor. This is strongly recommended when setting trainable attention processors.
+
         """
         count = len(self.attn_processors.keys())
 
@@ -555,9 +571,9 @@ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, Atte
         def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
             if hasattr(module, "set_processor"):
                 if not isinstance(processor, dict):
-                    module.set_processor(processor)
+                    module.set_processor(processor, _remove_lora=_remove_lora)
                 else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
 
             for sub_name, child in module.named_children():
                 fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
@@ -570,13 +586,25 @@ def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.
         """
-        self.set_attn_processor(AttnProcessor())
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor, _remove_lora=True)
 
+    # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice
     def set_attention_slice(self, slice_size):
         r"""
         Enable sliced attention computation.
+
         When this option is enabled, the attention module splits the input tensor in slices to compute attention in
         several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
         Args:
             slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
                 When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
@@ -656,6 +684,7 @@ def forward(
     ) -> Union[ControlNetOutput, Tuple]:
         """
         The [`ControlNetModel`] forward method.
+
         Args:
             sample (`paddle.Tensor`):
                 The noisy input tensor.
@@ -670,7 +699,13 @@ def forward(
             class_labels (`paddle.Tensor`, *optional*, defaults to `None`):
                 Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
             timestep_cond (`paddle.Tensor`, *optional*, defaults to `None`):
+                Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
+                timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
+                embeddings.
             attention_mask (`paddle.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
             added_cond_kwargs (`dict`):
                 Additional conditions for the Stable Diffusion XL UNet.
             cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
@@ -680,6 +715,7 @@ def forward(
                 you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
             return_dict (`bool`, defaults to `True`):
                 Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
+
         Returns:
             [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
                 If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
@@ -727,6 +763,7 @@ def forward(
         aug_emb = None
 
         emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
 
         if self.class_embedding is not None:
             if class_labels is None:
@@ -743,7 +780,7 @@ def forward(
             class_emb = self.class_embedding(class_labels).cast(sample.dtype)
             emb = emb + class_emb
 
-        if "addition_embed_type" in self.config:
+        if "addition_embed_type" in self.config and self.config.addition_embed_type is not None:
             if self.config.addition_embed_type == "text":
                 aug_emb = self.add_embedding(encoder_hidden_states)
 
@@ -776,12 +813,10 @@ def forward(
         sample = self.conv_in(sample)
 
         controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
-
         sample = sample + controlnet_cond
 
         # 3. down
         down_block_res_samples = (sample,)
-
         for downsample_block in self.down_blocks:
             if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                 sample, res_samples = downsample_block(
diff --git a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py b/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py
index d6f680e81..036fa40fa 100644
--- a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py
+++ b/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py
@@ -116,6 +116,10 @@ def forward(
                 Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
             attention_mask (`paddle.Tensor`, *optional*):
                 Optional attention mask to be applied in Attention
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
 
diff --git a/ppdiffusers/ppdiffusers/models/embeddings.py b/ppdiffusers/ppdiffusers/models/embeddings.py
index 6391fec12..ecc83b8e6 100644
--- a/ppdiffusers/ppdiffusers/models/embeddings.py
+++ b/ppdiffusers/ppdiffusers/models/embeddings.py
@@ -309,9 +309,13 @@ def __init__(self, embed_dim: int, max_seq_length: int = 32):
         super().__init__()
         position = paddle.arange(max_seq_length).unsqueeze(1)
         div_term = paddle.exp(paddle.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
-        pe = paddle.zeros(1, max_seq_length, embed_dim)
-        pe[0, :, 0::2] = paddle.sin(position * div_term)
-        pe[0, :, 1::2] = paddle.cos(position * div_term)
+        pe = paddle.zeros([1, max_seq_length, embed_dim])
+        pe[0, :, 0::2] = paddle.sin(
+            paddle.to_tensor(position * div_term, dtype=paddle.get_default_dtype())
+        )  # paddle: sin not support int64, convert to float32
+        pe[0, :, 1::2] = paddle.cos(
+            paddle.to_tensor(position * div_term, dtype=paddle.get_default_dtype())
+        )  # paddle: cos not support int64, convert to float32
         self.register_buffer("pe", pe)
 
     def forward(self, x):
diff --git a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
index 33986470c..7409275b6 100644
--- a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
+++ b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
@@ -24,6 +24,28 @@
 
 
 class T5FilmDecoder(ModelMixin, ConfigMixin):
+    r"""
+    T5 style decoder with FiLM conditioning.
+
+    Args:
+        input_dims (`int`, *optional*, defaults to `128`):
+            The number of input dimensions.
+        targets_length (`int`, *optional*, defaults to `256`):
+            The length of the targets.
+        d_model (`int`, *optional*, defaults to `768`):
+            Size of the input hidden states.
+        num_layers (`int`, *optional*, defaults to `12`):
+            The number of `DecoderLayer`'s to use.
+        num_heads (`int`, *optional*, defaults to `12`):
+            The number of attention heads to use.
+        d_kv (`int`, *optional*, defaults to `64`):
+            Size of the key-value projection vectors.
+        d_ff (`int`, *optional*, defaults to `2048`):
+            The number of dimensions in the intermediate feed-forward layer of `DecoderLayer`'s.
+        dropout_rate (`float`, *optional*, defaults to `0.1`):
+            Dropout probability.
+    """
+
     @register_to_config
     def __init__(
         self,
@@ -70,7 +92,7 @@ def encoder_decoder_mask(self, query_input, key_input):
 
     def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time):
         batch, _, _ = decoder_input_tokens.shape
-        assert decoder_noise_time.shape[0] == batch
+        assert decoder_noise_time.shape == (batch,)
 
         # decoder_noise_time is in [0, 1), so rescale to expected timing range.
         time_steps = get_timestep_embedding(
@@ -81,7 +103,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time)
 
         conditioning_emb = self.conditioning_emb(time_steps).unsqueeze(1)
 
-        assert conditioning_emb.shape == [batch, 1, self.config.d_model * 4]
+        assert conditioning_emb.shape == (batch, 1, self.config.d_model * 4)
 
         seq_length = decoder_input_tokens.shape[1]
 
@@ -125,7 +147,27 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time)
 
 
 class DecoderLayer(nn.Layer):
-    def __init__(self, d_model, d_kv, num_heads, d_ff, dropout_rate, layer_norm_epsilon=1e-6):
+    r"""
+    T5 decoder layer.
+
+    Args:
+        d_model (`int`):
+            Size of the input hidden states.
+        d_kv (`int`):
+            Size of the key-value projection vectors.
+        num_heads (`int`):
+            Number of attention heads.
+        d_ff (`int`):
+            Size of the intermediate feed-forward layer.
+        dropout_rate (`float`):
+            Dropout probability.
+        layer_norm_epsilon (`float`, *optional*, defaults to `1e-6`):
+            A small value used for numerical stability to avoid dividing by zero.
+    """
+
+    def __init__(
+        self, d_model: int, d_kv: int, num_heads: int, d_ff: int, dropout_rate: float, layer_norm_epsilon: float = 1e-6
+    ):
         super().__init__()
         self.layer = nn.LayerList()
 
@@ -183,7 +225,21 @@ def forward(
 
 
 class T5LayerSelfAttentionCond(nn.Layer):
-    def __init__(self, d_model, d_kv, num_heads, dropout_rate):
+    r"""
+    T5 style self-attention layer with conditioning.
+
+    Args:
+        d_model (`int`):
+            Size of the input hidden states.
+        d_kv (`int`):
+            Size of the key-value projection vectors.
+        num_heads (`int`):
+            Number of attention heads.
+        dropout_rate (`float`):
+            Dropout probability.
+    """
+
+    def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float):
         super().__init__()
         self.layer_norm = T5LayerNorm(d_model)
         self.FiLMLayer = T5FiLMLayer(in_features=d_model * 4, out_features=d_model)
@@ -211,7 +267,23 @@ def forward(
 
 
 class T5LayerCrossAttention(nn.Layer):
-    def __init__(self, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon):
+    r"""
+    T5 style cross-attention layer.
+
+    Args:
+        d_model (`int`):
+            Size of the input hidden states.
+        d_kv (`int`):
+            Size of the key-value projection vectors.
+        num_heads (`int`):
+            Number of attention heads.
+        dropout_rate (`float`):
+            Dropout probability.
+        layer_norm_epsilon (`float`):
+            A small value used for numerical stability to avoid dividing by zero.
+    """
+
+    def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float, layer_norm_epsilon: float):
         super().__init__()
         self.attention = Attention(query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, scale_qk=False)
         self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon)
@@ -234,7 +306,21 @@ def forward(
 
 
 class T5LayerFFCond(nn.Layer):
-    def __init__(self, d_model, d_ff, dropout_rate, layer_norm_epsilon):
+    r"""
+    T5 style feed-forward conditional layer.
+
+    Args:
+        d_model (`int`):
+            Size of the input hidden states.
+        d_ff (`int`):
+            Size of the intermediate feed-forward layer.
+        dropout_rate (`float`):
+            Dropout probability.
+        layer_norm_epsilon (`float`):
+            A small value used for numerical stability to avoid dividing by zero.
+    """
+
+    def __init__(self, d_model: int, d_ff: int, dropout_rate: float, layer_norm_epsilon: float):
         super().__init__()
         self.DenseReluDense = T5DenseGatedActDense(d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate)
         self.film = T5FiLMLayer(in_features=d_model * 4, out_features=d_model)
@@ -252,7 +338,19 @@ def forward(self, hidden_states, conditioning_emb=None):
 
 
 class T5DenseGatedActDense(nn.Layer):
-    def __init__(self, d_model, d_ff, dropout_rate):
+    r"""
+    T5 style feed-forward layer with gated activations and dropout.
+
+    Args:
+        d_model (`int`):
+            Size of the input hidden states.
+        d_ff (`int`):
+            Size of the intermediate feed-forward layer.
+        dropout_rate (`float`):
+            Dropout probability.
+    """
+
+    def __init__(self, d_model: int, d_ff: int, dropout_rate: float):
         super().__init__()
         self.wi_0 = nn.Linear(d_model, d_ff, bias_attr=False)
         self.wi_1 = nn.Linear(d_model, d_ff, bias_attr=False)
@@ -271,11 +369,20 @@ def forward(self, hidden_states):
 
 
 class T5LayerNorm(nn.Layer):
-    """
-    Construct a layernorm module in the T5 style No bias and no subtraction of mean.
+    r"""
+    T5 style layer normalization module.
+
+    Args:
+        hidden_size (`int`):
+            Size of the input hidden states.
+        eps (`float`, `optional`, defaults to `1e-6`):
+            A small value used for numerical stability to avoid dividing by zero.
     """
 
-    def __init__(self, hidden_size, eps=1e-6):
+    def __init__(self, hidden_size: int, eps: float = 1e-6):
+        """
+        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
+        """
         super().__init__()
         self.weight = self.create_parameter(shape=[hidden_size], default_initializer=nn.initializer.Constant(1.0))
         self.variance_epsilon = eps
@@ -309,10 +416,16 @@ def forward(self, input: paddle.Tensor) -> paddle.Tensor:
 
 class T5FiLMLayer(nn.Layer):
     """
-    FiLM Layer
+    T5 style FiLM Layer.
+
+    Args:
+        in_features (`int`):
+            Number of input features.
+        out_features (`int`):
+            Number of output features.
     """
 
-    def __init__(self, in_features, out_features):
+    def __init__(self, in_features: int, out_features: int):
         super().__init__()
         self.scale_bias = nn.Linear(in_features, out_features * 2, bias_attr=False)
 
diff --git a/ppdiffusers/ppdiffusers/models/transformer_temporal.py b/ppdiffusers/ppdiffusers/models/transformer_temporal.py
index 84847ffa5..75e80f6ab 100644
--- a/ppdiffusers/ppdiffusers/models/transformer_temporal.py
+++ b/ppdiffusers/ppdiffusers/models/transformer_temporal.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 from dataclasses import dataclass
-from typing import Optional
+from typing import Any, Dict, Optional
 
 import paddle
 import paddle.nn as nn
@@ -50,13 +50,21 @@ class TransformerTemporalModel(ModelMixin, ConfigMixin):
         num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
         dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
         cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
-        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
-            This is fixed during training since it is used to learn a number of position embeddings.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
         attention_bias (`bool`, *optional*):
             Configure if the `TransformerBlock` attention should contain a bias parameter.
+        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+            This is fixed during training since it is used to learn a number of position embeddings.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`):
+            Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported
+            activation functions.
+        norm_elementwise_affine (`bool`, *optional*):
+            Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization.
         double_self_attention (`bool`, *optional*):
             Configure if each `TransformerBlock` should contain two self-attention layers.
+        positional_embeddings: (`str`, *optional*):
+            The type of positional embeddings to apply to the sequence input before passing use.
+        num_positional_embeddings: (`int`, *optional*):
+            The maximum length of the sequence over which to apply positional embeddings.
     """
 
     @register_to_config
@@ -75,14 +83,19 @@ def __init__(
         activation_fn: str = "geglu",
         norm_elementwise_affine: bool = True,
         double_self_attention: bool = True,
+        positional_embeddings: Optional[str] = None,
+        num_positional_embeddings: Optional[int] = None,
     ):
         super().__init__()
         self.num_attention_heads = num_attention_heads
         self.attention_head_dim = attention_head_dim
         inner_dim = num_attention_heads * attention_head_dim
+
         self.in_channels = in_channels
+
         self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06)
         self.proj_in = nn.Linear(in_channels, inner_dim)
+
         # 3. Define transformers blocks
         self.transformer_blocks = nn.LayerList(
             [
@@ -96,22 +109,25 @@ def __init__(
                     attention_bias=attention_bias,
                     double_self_attention=double_self_attention,
                     norm_elementwise_affine=norm_elementwise_affine,
+                    positional_embeddings=positional_embeddings,
+                    num_positional_embeddings=num_positional_embeddings,
                 )
                 for d in range(num_layers)
             ]
         )
+
         self.proj_out = nn.Linear(inner_dim, in_channels)
 
     def forward(
         self,
-        hidden_states,
-        encoder_hidden_states=None,
-        timestep=None,
-        class_labels=None,
-        num_frames=1,
-        cross_attention_kwargs=None,
+        hidden_states: paddle.Tensor,
+        encoder_hidden_states: Optional[paddle.Tensor] = None,
+        timestep: Optional[paddle.Tensor] = None,
+        class_labels: paddle.Tensor = None,
+        num_frames: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
-    ):
+    ) -> TransformerTemporalModelOutput:
         """
         The [`TransformerTemporal`] forward method.
 
@@ -126,6 +142,12 @@ def forward(
             class_labels ( `paddle.Tensor` of shape `(batch size, num classes)`, *optional*):
                 Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
                 `AdaLayerZeroNorm`.
+            num_frames (`int`, *optional*, defaults to 1):
+                The number of frames to be processed per batch. This is used to reshape the hidden states.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
                 tuple.
@@ -138,14 +160,18 @@ def forward(
         # 1. Input
         batch_frames, channel, height, width = hidden_states.shape
         batch_size = batch_frames // num_frames
+
         residual = hidden_states
+
         hidden_states = hidden_states[None, :].reshape((batch_size, num_frames, channel, height, width))
         hidden_states = hidden_states.transpose([0, 2, 1, 3, 4])
+
         hidden_states = self.norm(hidden_states)
         hidden_states = hidden_states.transpose([0, 3, 4, 2, 1]).reshape(
             (batch_size * height * width, num_frames, channel)
         )
         hidden_states = self.proj_in(hidden_states)
+
         # 2. Blocks
         for block in self.transformer_blocks:
             hidden_states = block(
@@ -155,6 +181,7 @@ def forward(
                 cross_attention_kwargs=cross_attention_kwargs,
                 class_labels=class_labels,
             )
+
         # 3. Output
         hidden_states = self.proj_out(hidden_states)
         hidden_states = (
@@ -163,7 +190,10 @@ def forward(
             .transpose([0, 3, 4, 1, 2])
         )
         hidden_states = hidden_states.reshape((batch_frames, channel, height, width))
+
         output = hidden_states + residual
+
         if not return_dict:
             return (output,)
+
         return TransformerTemporalModelOutput(sample=output)
diff --git a/ppdiffusers/ppdiffusers/models/unet_motion_model.py b/ppdiffusers/ppdiffusers/models/unet_motion_model.py
new file mode 100644
index 000000000..71263cd04
--- /dev/null
+++ b/ppdiffusers/ppdiffusers/models/unet_motion_model.py
@@ -0,0 +1,874 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional, Tuple, Union
+
+import paddle
+import paddle.nn as nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..loaders import UNet2DConditionLoadersMixin
+from ..utils import logging
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
+from .embeddings import TimestepEmbedding, Timesteps
+from .modeling_utils import ModelMixin
+from .transformer_temporal import TransformerTemporalModel
+from .unet_2d_blocks import UNetMidBlock2DCrossAttn
+from .unet_2d_condition import UNet2DConditionModel
+from .unet_3d_blocks import (
+    CrossAttnDownBlockMotion,
+    CrossAttnUpBlockMotion,
+    DownBlockMotion,
+    UNetMidBlockCrossAttnMotion,
+    UpBlockMotion,
+    get_down_block,
+    get_up_block,
+)
+from .unet_3d_condition import UNet3DConditionOutput
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class MotionModules(nn.Layer):
+    def __init__(
+        self,
+        in_channels,
+        layers_per_block=2,
+        num_attention_heads=8,
+        attention_bias=False,
+        cross_attention_dim=None,
+        activation_fn="geglu",
+        norm_num_groups=32,
+        max_seq_length=32,
+    ):
+        super().__init__()
+        self.motion_modules = nn.LayerList([])
+
+        for i in range(layers_per_block):
+            self.motion_modules.append(
+                TransformerTemporalModel(
+                    in_channels=in_channels,
+                    norm_num_groups=norm_num_groups,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    attention_bias=attention_bias,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=in_channels // num_attention_heads,
+                    positional_embeddings="sinusoidal",
+                    num_positional_embeddings=max_seq_length,
+                )
+            )
+
+
+class MotionAdapter(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(
+        self,
+        block_out_channels=(320, 640, 1280, 1280),
+        motion_layers_per_block=2,
+        motion_mid_block_layers_per_block=1,
+        motion_num_attention_heads=8,
+        motion_norm_num_groups=32,
+        motion_max_seq_length=32,
+        use_motion_mid_block=True,
+    ):
+        """Container to store AnimateDiff Motion Modules
+
+        Args:
+            block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each UNet block.
+            motion_layers_per_block (`int`, *optional*, defaults to 2):
+                The number of motion layers per UNet block.
+            motion_mid_block_layers_per_block (`int`, *optional*, defaults to 1):
+                The number of motion layers in the middle UNet block.
+            motion_num_attention_heads (`int`, *optional*, defaults to 8):
+                The number of heads to use in each attention layer of the motion module.
+            motion_norm_num_groups (`int`, *optional*, defaults to 32):
+                The number of groups to use in each group normalization layer of the motion module.
+            motion_max_seq_length (`int`, *optional*, defaults to 32):
+                The maximum sequence length to use in the motion module.
+            use_motion_mid_block (`bool`, *optional*, defaults to True):
+                Whether to use a motion module in the middle of the UNet.
+        """
+
+        super().__init__()
+        down_blocks = []
+        up_blocks = []
+
+        for i, channel in enumerate(block_out_channels):
+            output_channel = block_out_channels[i]
+            down_blocks.append(
+                MotionModules(
+                    in_channels=output_channel,
+                    norm_num_groups=motion_norm_num_groups,
+                    cross_attention_dim=None,
+                    activation_fn="geglu",
+                    attention_bias=False,
+                    num_attention_heads=motion_num_attention_heads,
+                    max_seq_length=motion_max_seq_length,
+                    layers_per_block=motion_layers_per_block,
+                )
+            )
+
+        if use_motion_mid_block:
+            self.mid_block = MotionModules(
+                in_channels=block_out_channels[-1],
+                norm_num_groups=motion_norm_num_groups,
+                cross_attention_dim=None,
+                activation_fn="geglu",
+                attention_bias=False,
+                num_attention_heads=motion_num_attention_heads,
+                layers_per_block=motion_mid_block_layers_per_block,
+                max_seq_length=motion_max_seq_length,
+            )
+        else:
+            self.mid_block = None
+
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, channel in enumerate(reversed_block_out_channels):
+            output_channel = reversed_block_out_channels[i]
+            up_blocks.append(
+                MotionModules(
+                    in_channels=output_channel,
+                    norm_num_groups=motion_norm_num_groups,
+                    cross_attention_dim=None,
+                    activation_fn="geglu",
+                    attention_bias=False,
+                    num_attention_heads=motion_num_attention_heads,
+                    max_seq_length=motion_max_seq_length,
+                    layers_per_block=motion_layers_per_block + 1,
+                )
+            )
+
+        self.down_blocks = nn.LayerList(down_blocks)
+        self.up_blocks = nn.LayerList(up_blocks)
+
+    def forward(self, sample):
+        pass
+
+
+class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    r"""
+    A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a
+    sample shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+    """
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlockMotion",
+            "CrossAttnDownBlockMotion",
+            "CrossAttnDownBlockMotion",
+            "DownBlockMotion",
+        ),
+        up_block_types: Tuple[str] = (
+            "UpBlockMotion",
+            "CrossAttnUpBlockMotion",
+            "CrossAttnUpBlockMotion",
+            "CrossAttnUpBlockMotion",
+        ),
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        use_linear_projection: bool = False,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = 8,
+        motion_max_seq_length: Optional[int] = 32,
+        motion_num_attention_heads: int = 8,
+        use_motion_mid_block: int = True,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        # input
+        conv_in_kernel = 3
+        conv_out_kernel = 3
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2D(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+
+        # time
+        time_embed_dim = block_out_channels[0] * 4
+        self.time_proj = Timesteps(block_out_channels[0], True, 0)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+        )
+
+        # class embedding
+        self.down_blocks = nn.LayerList([])
+        self.up_blocks = nn.LayerList([])
+
+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=num_attention_heads[i],
+                downsample_padding=downsample_padding,
+                use_linear_projection=use_linear_projection,
+                dual_cross_attention=False,
+                temporal_num_attention_heads=motion_num_attention_heads,
+                temporal_max_seq_length=motion_max_seq_length,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        if use_motion_mid_block:
+            self.mid_block = UNetMidBlockCrossAttnMotion(
+                in_channels=block_out_channels[-1],
+                temb_channels=time_embed_dim,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=False,
+                temporal_num_attention_heads=motion_num_attention_heads,
+                temporal_max_seq_length=motion_max_seq_length,
+            )
+
+        else:
+            self.mid_block = UNetMidBlock2DCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=time_embed_dim,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=False,
+            )
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_num_attention_heads = list(reversed(num_attention_heads))
+
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                num_attention_heads=reversed_num_attention_heads[i],
+                dual_cross_attention=False,
+                resolution_idx=i,
+                use_linear_projection=use_linear_projection,
+                temporal_num_attention_heads=motion_num_attention_heads,
+                temporal_max_seq_length=motion_max_seq_length,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps
+            )
+            self.conv_act = nn.Silu()
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = nn.Conv2D(
+            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+        )
+
+    @classmethod
+    def from_unet2d(
+        cls,
+        unet: UNet2DConditionModel,
+        motion_adapter: Optional[MotionAdapter] = None,
+        load_weights: bool = True,
+    ):
+        has_motion_adapter = motion_adapter is not None
+
+        # based on https://github.com/guoyww/AnimateDiff/blob/895f3220c06318ea0760131ec70408b466c49333/animatediff/models/unet.py#L459
+        config = unet.config
+        config["_class_name"] = cls.__name__
+
+        down_blocks = []
+        for down_blocks_type in config["down_block_types"]:
+            if "CrossAttn" in down_blocks_type:
+                down_blocks.append("CrossAttnDownBlockMotion")
+            else:
+                down_blocks.append("DownBlockMotion")
+        config["down_block_types"] = down_blocks
+
+        up_blocks = []
+        for down_blocks_type in config["up_block_types"]:
+            if "CrossAttn" in down_blocks_type:
+                up_blocks.append("CrossAttnUpBlockMotion")
+            else:
+                up_blocks.append("UpBlockMotion")
+
+        config["up_block_types"] = up_blocks
+
+        if has_motion_adapter:
+            config["motion_num_attention_heads"] = motion_adapter.config["motion_num_attention_heads"]
+            config["motion_max_seq_length"] = motion_adapter.config["motion_max_seq_length"]
+            config["use_motion_mid_block"] = motion_adapter.config["use_motion_mid_block"]
+
+        # Need this for backwards compatibility with UNet2DConditionModel checkpoints
+        if not config.get("num_attention_heads"):
+            config["num_attention_heads"] = config["attention_head_dim"]
+
+        model = cls.from_config(config)
+
+        if not load_weights:
+            return model
+
+        model.conv_in.load_dict(unet.conv_in.state_dict())
+        model.time_proj.load_dict(unet.time_proj.state_dict())
+        model.time_embedding.load_dict(unet.time_embedding.state_dict())
+
+        for i, down_block in enumerate(unet.down_blocks):
+            model.down_blocks[i].resnets.load_dict(down_block.resnets.state_dict())
+            if hasattr(model.down_blocks[i], "attentions"):
+                model.down_blocks[i].attentions.load_dict(down_block.attentions.state_dict())
+            if model.down_blocks[i].downsamplers:
+                model.down_blocks[i].downsamplers.load_dict(down_block.downsamplers.state_dict())
+
+        for i, up_block in enumerate(unet.up_blocks):
+            model.up_blocks[i].resnets.load_dict(up_block.resnets.state_dict())
+            if hasattr(model.up_blocks[i], "attentions"):
+                model.up_blocks[i].attentions.load_dict(up_block.attentions.state_dict())
+            if model.up_blocks[i].upsamplers:
+                model.up_blocks[i].upsamplers.load_dict(up_block.upsamplers.state_dict())
+
+        model.mid_block.resnets.load_dict(unet.mid_block.resnets.state_dict())
+        model.mid_block.attentions.load_dict(unet.mid_block.attentions.state_dict())
+
+        if unet.conv_norm_out is not None:
+            model.conv_norm_out.load_dict(unet.conv_norm_out.state_dict())
+        if unet.conv_act is not None:
+            model.conv_act.load_dict(unet.conv_act.state_dict())
+        model.conv_out.load_dict(unet.conv_out.state_dict())
+
+        if has_motion_adapter:
+            model.load_motion_modules(motion_adapter)
+
+        # ensure that the Motion UNet is the same dtype as the UNet2DConditionModel
+        model.to(unet.dtype)
+
+        return model
+
+    def freeze_unet2d_params(self):
+        """Freeze the weights of just the UNet2DConditionModel, and leave the motion modules
+        unfrozen for fine tuning.
+        """
+        # Freeze everything
+        for param in self.parameters():
+            param.requires_grad = False
+
+        # Unfreeze Motion Modules
+        for down_block in self.down_blocks:
+            motion_modules = down_block.motion_modules
+            for param in motion_modules.parameters():
+                param.requires_grad = True
+
+        for up_block in self.up_blocks:
+            motion_modules = up_block.motion_modules
+            for param in motion_modules.parameters():
+                param.requires_grad = True
+
+        if hasattr(self.mid_block, "motion_modules"):
+            motion_modules = self.mid_block.motion_modules
+            for param in motion_modules.parameters():
+                param.requires_grad = True
+
+        return
+
+    def load_motion_modules(self, motion_adapter: Optional[MotionAdapter]):
+        for i, down_block in enumerate(motion_adapter.down_blocks):
+            self.down_blocks[i].motion_modules.load_dict(down_block.motion_modules.state_dict())
+        for i, up_block in enumerate(motion_adapter.up_blocks):
+            self.up_blocks[i].motion_modules.load_dict(up_block.motion_modules.state_dict())
+
+        # to support older motion modules that don't have a mid_block
+        if hasattr(self.mid_block, "motion_modules"):
+            self.mid_block.motion_modules.load_dict(motion_adapter.mid_block.motion_modules.state_dict())
+
+    def save_motion_modules(
+        self,
+        save_directory: str,
+        is_main_process: bool = True,
+        safe_serialization: bool = True,
+        variant: Optional[str] = None,
+        push_to_hub: bool = False,
+        **kwargs,
+    ):
+        state_dict = self.state_dict()
+
+        # Extract all motion modules
+        motion_state_dict = {}
+        for k, v in state_dict.items():
+            if "motion_modules" in k:
+                motion_state_dict[k] = v
+
+        adapter = MotionAdapter(
+            block_out_channels=self.config["block_out_channels"],
+            motion_layers_per_block=self.config["layers_per_block"],
+            motion_norm_num_groups=self.config["norm_num_groups"],
+            motion_num_attention_heads=self.config["motion_num_attention_heads"],
+            motion_max_seq_length=self.config["motion_max_seq_length"],
+            use_motion_mid_block=self.config["use_motion_mid_block"],
+        )
+        adapter.load_dict(motion_state_dict)
+        adapter.save_pretrained(
+            save_directory=save_directory,
+            is_main_process=is_main_process,
+            safe_serialization=safe_serialization,
+            variant=variant,
+            push_to_hub=push_to_hub,
+            **kwargs,
+        )
+
+    @property
+    # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: paddle.nn.Layer, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: paddle.nn.Layer, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor, _remove_lora=_remove_lora)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from ppdiffusers.models.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
+    def enable_forward_chunking(self, chunk_size=None, dim=0):
+        """
+        Sets the attention processor to use [feed forward
+        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
+
+        Parameters:
+            chunk_size (`int`, *optional*):
+                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
+                over each tensor of dim=`dim`.
+            dim (`int`, *optional*, defaults to `0`):
+                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
+                or dim=1 (sequence length).
+        """
+        if dim not in [0, 1]:
+            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
+
+        # By default chunk size is 1
+        chunk_size = chunk_size or 1
+
+        def fn_recursive_feed_forward(module: paddle.nn.Layer, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, chunk_size, dim)
+
+    # Copied from ppdiffusers.models.unet_3d_condition.UNet3DConditionModel.disable_forward_chunking
+    def disable_forward_chunking(self):
+        def fn_recursive_feed_forward(module: paddle.nn.Layer, chunk_size: int, dim: int):
+            if hasattr(module, "set_chunk_feed_forward"):
+                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
+
+            for child in module.children():
+                fn_recursive_feed_forward(child, chunk_size, dim)
+
+        for module in self.children():
+            fn_recursive_feed_forward(module, None, 0)
+
+    # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor, _remove_lora=True)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (CrossAttnDownBlockMotion, DownBlockMotion, CrossAttnUpBlockMotion, UpBlockMotion)):
+            module.gradient_checkpointing = value
+
+    # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.enable_freeu
+    def enable_freeu(self, s1, s2, b1, b2):
+        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
+
+        The suffixes after the scaling factors represent the stage blocks where they are being applied.
+
+        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
+        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+        """
+        for i, upsample_block in enumerate(self.up_blocks):
+            setattr(upsample_block, "s1", s1)
+            setattr(upsample_block, "s2", s2)
+            setattr(upsample_block, "b1", b1)
+            setattr(upsample_block, "b2", b2)
+
+    # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.disable_freeu
+    def disable_freeu(self):
+        """Disables the FreeU mechanism."""
+        freeu_keys = {"s1", "s2", "b1", "b2"}
+        for i, upsample_block in enumerate(self.up_blocks):
+            for k in freeu_keys:
+                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
+                    setattr(upsample_block, k, None)
+
+    def forward(
+        self,
+        sample: paddle.Tensor,
+        timestep: Union[paddle.Tensor, float, int],
+        encoder_hidden_states: paddle.Tensor,
+        timestep_cond: Optional[paddle.Tensor] = None,
+        attention_mask: Optional[paddle.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
+        mid_block_additional_residual: Optional[paddle.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet3DConditionOutput, Tuple]:
+        r"""
+        The [`UNetMotionModel`] forward method.
+
+        Args:
+            sample (`paddle.Tensor`):
+                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
+            timestep (`paddle.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`paddle.Tensor`):
+                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            timestep_cond: (`paddle.Tensor`, *optional*, defaults to `None`):
+                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
+                through the `self.time_embedding` layer to obtain the timestep embeddings.
+            attention_mask (`paddle.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            down_block_additional_residuals: (`tuple` of `paddle.Tensor`, *optional*):
+                A tuple of tensors that if specified are added to the residuals of down unet blocks.
+            mid_block_additional_residual: (`paddle.Tensor`, *optional*):
+                A tensor that if specified is added to the residual of the middle unet block.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
+                a `tuple` is returned where the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # 1. time
+        timesteps = timestep
+        if not isinstance(timesteps, paddle.Tensor):
+            # # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # # This would be a good case for the `match` statement (Python 3.10+)
+            # is_mps = sample.device.type == "mps"
+            # if isinstance(timestep, float):
+            #     dtype = paddle.float32 if is_mps else paddle.float64
+            # else:
+            #     dtype = paddle.int32 if is_mps else paddle.int64
+            timesteps = paddle.to_tensor([timesteps], dtype=paddle.int64, place=sample.place)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None]
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        num_frames = sample.shape[2]
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
+
+        emb = self.time_embedding(t_emb, timestep_cond)
+        emb = emb.repeat_interleave(repeats=num_frames, axis=0)
+        encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, axis=0)
+
+        # 2. pre-process
+        shape1 = [sample.shape[0] * num_frames, -1] + sample.shape[3:]
+        sample = sample.transpose((0, 2, 1, 3, 4)).reshape(shape1)
+        sample = self.conv_in(sample)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames)
+
+            down_block_res_samples += res_samples
+
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples += (down_block_res_sample,)
+
+            down_block_res_samples = new_down_block_res_samples
+
+        # 4. mid
+        if self.mid_block is not None:
+            # To support older versions of motion modules that don't have a mid_block
+            if hasattr(self.mid_block, "motion_modules"):
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+
+        if mid_block_additional_residual is not None:
+            sample = sample + mid_block_additional_residual
+
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    num_frames=num_frames,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    num_frames=num_frames,
+                )
+
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+
+        sample = self.conv_out(sample)
+
+        # reshape to (batch, channel, framerate, width, height)
+        sample = sample[None, :].reshape([-1, num_frames] + sample.shape[1:]).transpose((0, 2, 1, 3, 4))
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet3DConditionOutput(sample=sample)
diff --git a/ppdiffusers/ppdiffusers/models/vae.py b/ppdiffusers/ppdiffusers/models/vae.py
index 019cacee0..3b08d1bf2 100644
--- a/ppdiffusers/ppdiffusers/models/vae.py
+++ b/ppdiffusers/ppdiffusers/models/vae.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Tuple
 
 import numpy as np
 import paddle
@@ -21,8 +21,14 @@
 from paddle.distributed.fleet.utils import recompute
 
 from ..utils import BaseOutput, randn_tensor
+from .activations import get_activation
 from .attention_processor import SpatialNorm
-from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
+from .unet_2d_blocks import (
+    AutoencoderTinyBlock,
+    UNetMidBlock2D,
+    get_down_block,
+    get_up_block,
+)
 
 try:
     from paddle.amp.auto_cast import amp_state
@@ -53,16 +59,39 @@ class DecoderOutput(BaseOutput):
 
 
 class Encoder(nn.Layer):
+    r"""
+    The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
+            options.
+        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+            The number of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+        act_fn (`str`, *optional*, defaults to `"silu"`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+        double_z (`bool`, *optional*, defaults to `True`):
+            Whether to double the number of output channels for the last block.
+    """
+
     def __init__(
         self,
-        in_channels=3,
-        out_channels=3,
-        down_block_types=("DownEncoderBlock2D",),
-        block_out_channels=(64,),
-        layers_per_block=2,
-        norm_num_groups=32,
-        act_fn="silu",
-        double_z=True,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        act_fn: str = "silu",
+        double_z: bool = True,
     ):
         super().__init__()
         self.layers_per_block = layers_per_block
@@ -116,8 +145,9 @@ def __init__(
         self.conv_out = nn.Conv2D(block_out_channels[-1], conv_out_channels, 3, padding=1)
         self.gradient_checkpointing = False
 
-    def forward(self, x):
-        sample = x
+    def forward(self, sample: paddle.Tensor) -> paddle.Tensor:
+        r"""The forward method of the `Encoder` class."""
+
         sample = self.conv_in(sample)
 
         if self.training and self.gradient_checkpointing and not sample.stop_gradient:
@@ -152,16 +182,38 @@ def custom_forward(*inputs):
 
 
 class Decoder(nn.Layer):
+    r"""
+    The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
+        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+            The number of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+        act_fn (`str`, *optional*, defaults to `"silu"`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+        norm_type (`str`, *optional*, defaults to `"group"`):
+            The normalization type to use. Can be either `"group"` or `"spatial"`.
+    """
+
     def __init__(
         self,
-        in_channels=3,
-        out_channels=3,
-        up_block_types=("UpDecoderBlock2D",),
-        block_out_channels=(64,),
-        layers_per_block=2,
-        norm_num_groups=32,
-        act_fn="silu",
-        norm_type="group",  # group, spatial
+        in_channels: int = 3,
+        out_channels: int = 3,
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        act_fn: str = "silu",
+        norm_type: str = "group",  # group, spatial
     ):
         super().__init__()
         self.layers_per_block = layers_per_block
@@ -221,8 +273,9 @@ def __init__(
         self.conv_out = nn.Conv2D(block_out_channels[0], out_channels, 3, padding=1)
         self.gradient_checkpointing = False
 
-    def forward(self, z, latent_embeds=None):
-        sample = z
+    def forward(self, sample: paddle.Tensor, latent_embeds: Optional[paddle.Tensor] = None) -> paddle.Tensor:
+        r"""The forward method of the `Decoder` class."""
+
         sample = self.conv_in(sample)
 
         upscale_dtype = self.up_blocks.dtype
@@ -270,6 +323,16 @@ def custom_forward(*inputs):
 
 
 class UpSample(nn.Layer):
+    r"""
+    The `UpSample` layer of a variational autoencoder that upsamples its input.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+    """
+
     def __init__(
         self,
         in_channels: int,
@@ -281,6 +344,7 @@ def __init__(
         self.deconv = nn.Conv2DTranspose(in_channels, out_channels, kernel_size=4, stride=2, padding=1)
 
     def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        r"""The forward method of the `UpSample` class."""
         x = paddle.nn.functional.relu(x)
         x = self.deconv(x)
         return x
@@ -329,6 +393,7 @@ def __init__(
         self.layers = nn.Sequential(*layers)
 
     def forward(self, x: paddle.Tensor, mask=None) -> paddle.Tensor:
+        r"""The forward method of the `MaskConditionEncoder` class."""
         out = {}
         for l in range(len(self.layers)):
             layer = self.layers[l]
@@ -339,19 +404,38 @@ def forward(self, x: paddle.Tensor, mask=None) -> paddle.Tensor:
 
 
 class MaskConditionDecoder(nn.Layer):
-    """The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's
-    decoder with a conditioner on the mask and masked image."""
+    r"""The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's
+    decoder with a conditioner on the mask and masked image.
+
+    Args:
+        in_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        out_channels (`int`, *optional*, defaults to 3):
+            The number of output channels.
+        up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
+        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
+            The number of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups for normalization.
+        act_fn (`str`, *optional*, defaults to `"silu"`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+        norm_type (`str`, *optional*, defaults to `"group"`):
+            The normalization type to use. Can be either `"group"` or `"spatial"`.
+    """
 
     def __init__(
         self,
-        in_channels=3,
-        out_channels=3,
-        up_block_types=("UpDecoderBlock2D",),
-        block_out_channels=(64,),
-        layers_per_block=2,
-        norm_num_groups=32,
-        act_fn="silu",
-        norm_type="group",  # group, spatial
+        in_channels: int = 3,
+        out_channels: int = 3,
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
+        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        act_fn: str = "silu",
+        norm_type: str = "group",  # group, spatial
     ):
         super().__init__()
         self.layers_per_block = layers_per_block
@@ -426,7 +510,14 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    def forward(self, z, image=None, mask=None, latent_embeds=None):
+    def forward(
+        self,
+        z: paddle.Tensor,
+        image: Optional[paddle.Tensor] = None,
+        mask: Optional[paddle.Tensor] = None,
+        latent_embeds: Optional[paddle.Tensor] = None,
+    ) -> paddle.Tensor:
+        r"""The forward method of the `MaskConditionDecoder` class."""
         sample = z
         sample = self.conv_in(sample)
 
@@ -498,7 +589,14 @@ class VectorQuantizer(nn.Layer):
     # backwards compatibility we use the buggy version by default, but you can
     # specify legacy=False to fix it.
     def __init__(
-        self, n_e, vq_embed_dim, beta, remap=None, unknown_index="random", sane_index_shape=False, legacy=True
+        self,
+        n_e: int,
+        vq_embed_dim: int,
+        beta: float,
+        remap=None,
+        unknown_index: str = "random",
+        sane_index_shape: bool = False,
+        legacy: bool = True,
     ):
         super().__init__()
         self.n_e = n_e
@@ -527,7 +625,7 @@ def __init__(
 
         self.sane_index_shape = sane_index_shape
 
-    def remap_to_used(self, inds):
+    def remap_to_used(self, inds: paddle.Tensor) -> paddle.Tensor:
         ishape = inds.shape
         assert len(ishape) > 1
         inds = inds.reshape([ishape[0], -1])
@@ -541,7 +639,7 @@ def remap_to_used(self, inds):
             new[unknown] = self.unknown_index
         return new.reshape(ishape)
 
-    def unmap_to_all(self, inds):
+    def unmap_to_all(self, inds: paddle.Tensor) -> paddle.Tensor:
         ishape = inds.shape
         assert len(ishape) > 1
         inds = inds.reshape([ishape[0], -1])
@@ -551,7 +649,7 @@ def unmap_to_all(self, inds):
         back = paddle.take_along_axis(used[None, :][inds.shape[0] * [0], :], inds, axis=1)
         return back.reshape(ishape)
 
-    def forward(self, z):
+    def forward(self, z: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor, Tuple]:
         # reshape z -> (batch, height, width, channel) and flatten
         z = z.transpose([0, 2, 3, 1])
         z_flattened = z.reshape([-1, self.vq_embed_dim])
@@ -590,7 +688,7 @@ def forward(self, z):
 
         return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
 
-    def get_codebook_entry(self, indices, shape):
+    def get_codebook_entry(self, indices: paddle.Tensor, shape: Tuple[int, ...]) -> paddle.Tensor:
         # shape specifying (batch, height, width, channel)
         if self.remap is not None:
             indices = indices.reshape([shape[0], -1])  # add batch axis
@@ -613,7 +711,7 @@ def get_codebook_entry(self, indices, shape):
 
 
 class DiagonalGaussianDistribution(object):
-    def __init__(self, parameters, deterministic=False):
+    def __init__(self, parameters: paddle.Tensor, deterministic: bool = False):
         self.parameters = parameters
         self.mean, self.logvar = paddle.chunk(parameters, 2, axis=1)
         self.logvar = paddle.clip(self.logvar, -30.0, 20.0)
@@ -629,7 +727,7 @@ def sample(self, generator: Optional[paddle.Generator] = None) -> paddle.Tensor:
         x = self.mean + self.std * sample
         return x
 
-    def kl(self, other=None):
+    def kl(self, other: "DiagonalGaussianDistribution" = None) -> paddle.Tensor:
         if self.deterministic:
             return paddle.to_tensor([0.0])
         else:
@@ -653,3 +751,141 @@ def nll(self, sample, axis=[1, 2, 3]):
 
     def mode(self):
         return self.mean
+
+
+class EncoderTiny(nn.Layer):
+    r"""
+    The `EncoderTiny` layer is a simpler version of the `Encoder` layer.
+
+    Args:
+        in_channels (`int`):
+            The number of input channels.
+        out_channels (`int`):
+            The number of output channels.
+        num_blocks (`Tuple[int, ...]`):
+            Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to
+            use.
+        block_out_channels (`Tuple[int, ...]`):
+            The number of output channels for each block.
+        act_fn (`str`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_blocks: Tuple[int, ...],
+        block_out_channels: Tuple[int, ...],
+        act_fn: str,
+    ):
+        super().__init__()
+
+        layers = []
+        for i, num_block in enumerate(num_blocks):
+            num_channels = block_out_channels[i]
+
+            if i == 0:
+                layers.append(nn.Conv2D(in_channels, num_channels, kernel_size=3, padding=1))
+            else:
+                layers.append(nn.Conv2D(num_channels, num_channels, kernel_size=3, padding=1, stride=2, bias=False))
+
+            for _ in range(num_block):
+                layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn))
+
+        layers.append(nn.Conv2D(block_out_channels[-1], out_channels, kernel_size=3, padding=1))
+
+        self.layers = nn.Sequential(*layers)
+        self.gradient_checkpointing = False
+
+    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        r"""The forward method of the `EncoderTiny` class."""
+        if self.training and self.gradient_checkpointing and not x.stop_gradient:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            x = recompute(create_custom_forward(self.layers), x)
+
+        else:
+            # scale image from [-1, 1] to [0, 1] to match TAESD convention
+            x = self.layers(x.add(1).div(2))
+
+        return x
+
+
+class DecoderTiny(nn.Layer):
+    r"""
+    The `DecoderTiny` layer is a simpler version of the `Decoder` layer.
+
+    Args:
+        in_channels (`int`):
+            The number of input channels.
+        out_channels (`int`):
+            The number of output channels.
+        num_blocks (`Tuple[int, ...]`):
+            Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to
+            use.
+        block_out_channels (`Tuple[int, ...]`):
+            The number of output channels for each block.
+        upsampling_scaling_factor (`int`):
+            The scaling factor to use for upsampling.
+        act_fn (`str`):
+            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_blocks: Tuple[int, ...],
+        block_out_channels: Tuple[int, ...],
+        upsampling_scaling_factor: int,
+        act_fn: str,
+    ):
+        super().__init__()
+
+        layers = [
+            nn.Conv2D(in_channels, block_out_channels[0], kernel_size=3, padding=1),
+            get_activation(act_fn),
+        ]
+
+        for i, num_block in enumerate(num_blocks):
+            is_final_block = i == (len(num_blocks) - 1)
+            num_channels = block_out_channels[i]
+
+            for _ in range(num_block):
+                layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn))
+
+            if not is_final_block:
+                layers.append(nn.Upsample(scale_factor=upsampling_scaling_factor))
+
+            conv_out_channel = num_channels if not is_final_block else out_channels
+            layers.append(nn.Conv2d(num_channels, conv_out_channel, kernel_size=3, padding=1, bias=is_final_block))
+
+        self.layers = nn.Sequential(*layers)
+        self.gradient_checkpointing = False
+
+    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        r"""The forward method of the `DecoderTiny` class."""
+        # Clamp.
+        x = paddle.tanh(x / 3) * 3
+
+        if self.training and self.gradient_checkpointing:
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    return module(*inputs)
+
+                return custom_forward
+
+            x = recompute(create_custom_forward(self.layers), x)
+
+        else:
+            x = self.layers(x)
+
+        # scale image from [0, 1] to [-1, 1] to match diffusers convention
+        return x.multiply(paddle.to_tensor(2)).subtract(paddle.to_tensor(1))
diff --git a/ppdiffusers/ppdiffusers/models/vq_model.py b/ppdiffusers/ppdiffusers/models/vq_model.py
index 1d6dd5a60..ec67e6c75 100644
--- a/ppdiffusers/ppdiffusers/models/vq_model.py
+++ b/ppdiffusers/ppdiffusers/models/vq_model.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
 
 import paddle
 import paddle.nn as nn
@@ -53,10 +53,12 @@ class VQModel(ModelMixin, ConfigMixin):
             Tuple of upsample block types.
         block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
             Tuple of block output channels.
+        layers_per_block (`int`, *optional*, defaults to `1`): Number of layers per block.
         act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
         latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
         sample_size (`int`, *optional*, defaults to `32`): Sample input size.
         num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
+        norm_num_groups (`int`, *optional*, defaults to `32`): Number of groups for normalization layers.
         vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
         scaling_factor (`float`, *optional*, defaults to `0.18215`):
             The component-wise standard deviation of the trained latent space computed using the first batch of the
@@ -65,6 +67,8 @@ class VQModel(ModelMixin, ConfigMixin):
             diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
             / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
             Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
+        norm_type (`str`, *optional*, defaults to `"group"`):
+            Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
     """
 
     @register_to_config
@@ -72,9 +76,9 @@ def __init__(
         self,
         in_channels: int = 3,
         out_channels: int = 3,
-        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
-        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
-        block_out_channels: Tuple[int] = (64,),
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
         layers_per_block: int = 1,
         act_fn: str = "silu",
         latent_channels: int = 3,
@@ -128,9 +132,9 @@ def encode(self, x: paddle.Tensor, return_dict: bool = True):
         return VQEncoderOutput(latents=h)
 
     @apply_forward_hook
-    def decode(self, h: paddle.Tensor, force_not_quantize: bool = False, return_dict: bool = True):
-        # cast h to float16 / float32
-        h = h.cast(self.dtype)
+    def decode(
+        self, h: paddle.Tensor, force_not_quantize: bool = False, return_dict: bool = True
+    ) -> Union[DecoderOutput, paddle.Tensor]:
         # also go through quantization layer
         if not force_not_quantize:
             quant, emb_loss, info = self.quantize(h)
@@ -144,7 +148,7 @@ def decode(self, h: paddle.Tensor, force_not_quantize: bool = False, return_dict
 
         return DecoderOutput(sample=dec)
 
-    def forward(self, sample: paddle.Tensor, return_dict: bool = True):
+    def forward(self, sample: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
         r"""
         The [`VQModel`] forward method.
 
@@ -158,8 +162,8 @@ def forward(self, sample: paddle.Tensor, return_dict: bool = True):
                 If return_dict is True, a [`~models.vq_model.VQEncoderOutput`] is returned, otherwise a plain `tuple`
                 is returned.
         """
-        x = sample
-        h = self.encode(x).latents
+
+        h = self.encode(sample).latents
         dec = self.decode(h).sample
 
         if not return_dict:
diff --git a/ppdiffusers/tests/models/test_models_unet_motion.py b/ppdiffusers/tests/models/test_models_unet_motion.py
new file mode 100644
index 000000000..94a6b9a07
--- /dev/null
+++ b/ppdiffusers/tests/models/test_models_unet_motion.py
@@ -0,0 +1,299 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import os
+import tempfile
+import unittest
+
+import numpy as np
+import paddle
+
+from ppdiffusers import MotionAdapter, UNet2DConditionModel, UNetMotionModel
+from ppdiffusers.utils import logging
+from ppdiffusers.utils.testing_utils import enable_full_determinism, floats_tensor
+
+from .test_modeling_common import ModelTesterMixin, UNetTesterMixin
+
+logger = logging.get_logger(__name__)
+
+enable_full_determinism()
+
+
+class UNetMotionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
+    model_class = UNetMotionModel
+    main_input_name = "sample"
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 4
+        num_frames = 8
+        sizes = (32, 32)
+
+        noise = floats_tensor((batch_size, num_channels, num_frames) + sizes)
+        time_step = paddle.to_tensor([10])
+        encoder_hidden_states = floats_tensor((batch_size, 4, 32))
+
+        return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
+
+    @property
+    def input_shape(self):
+        return (4, 8, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (4, 8, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": (32, 64),
+            "down_block_types": ("CrossAttnDownBlockMotion", "DownBlockMotion"),
+            "up_block_types": ("UpBlockMotion", "CrossAttnUpBlockMotion"),
+            "cross_attention_dim": 32,
+            "num_attention_heads": 4,
+            "out_channels": 4,
+            "in_channels": 4,
+            "layers_per_block": 1,
+            "sample_size": 32,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_from_unet2d(self):
+        paddle.seed(0)
+        unet2d = UNet2DConditionModel()
+
+        paddle.seed(1)
+        model = self.model_class.from_unet2d(unet2d)
+        model_state_dict = model.state_dict()
+
+        for param_name, param_value in unet2d.named_parameters():
+            self.assertTrue(paddle.equal(model_state_dict[param_name], param_value))
+
+    def test_freeze_unet2d(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.freeze_unet2d_params()
+
+        for param_name, param_value in model.named_parameters():
+            if "motion_modules" not in param_name:
+                self.assertFalse(not param_value.stop_gradient)
+
+            else:
+                self.assertTrue(not param_value.stop_gradient)
+
+    def test_loading_motion_adapter(self):
+        model = self.model_class()
+        adapter = MotionAdapter()
+        model.load_motion_modules(adapter)
+
+        for idx, down_block in enumerate(model.down_blocks):
+            adapter_state_dict = adapter.down_blocks[idx].motion_modules.state_dict()
+            for param_name, param_value in down_block.motion_modules.named_parameters():
+                self.assertTrue(paddle.equal(adapter_state_dict[param_name], param_value))
+
+        for idx, up_block in enumerate(model.up_blocks):
+            adapter_state_dict = adapter.up_blocks[idx].motion_modules.state_dict()
+            for param_name, param_value in up_block.motion_modules.named_parameters():
+                self.assertTrue(paddle.equal(adapter_state_dict[param_name], param_value))
+
+        mid_block_adapter_state_dict = adapter.mid_block.motion_modules.state_dict()
+        for param_name, param_value in model.mid_block.motion_modules.named_parameters():
+            self.assertTrue(paddle.equal(mid_block_adapter_state_dict[param_name], param_value))
+
+    def test_saving_motion_modules(self):
+        paddle.seed(0)
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_motion_modules(tmpdirname)
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "diffusion_pytorch_model.safetensors")))
+
+            adapter_loaded = MotionAdapter.from_pretrained(tmpdirname)
+
+            paddle.seed(0)
+            model_loaded = self.model_class(**init_dict)
+            model_loaded.load_motion_modules(adapter_loaded)
+            model_loaded
+
+        with paddle.no_grad():
+            output = model(**inputs_dict)[0]
+            output_loaded = model_loaded(**inputs_dict)[0]
+
+        max_diff = (output - output_loaded).abs().max().item()
+        self.assertLessEqual(max_diff, 1e-4, "Models give different forward passes")
+
+    # @unittest.skipIf(
+    #     torch_device != "cuda" or not is_xformers_available(),
+    #     reason="XFormers attention is only available with CUDA and `xformers` installed",
+    # )
+    # def test_xformers_enable_works(self):
+    #     init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+    #     model = self.model_class(**init_dict)
+
+    #     model.enable_xformers_memory_efficient_attention()
+
+    #     assert (
+    #         model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__
+    #         == "XFormersAttnProcessor"
+    #     ), "xformers is not enabled"
+
+    def test_gradient_checkpointing_is_applied(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model_class_copy = copy.copy(self.model_class)
+
+        modules_with_gc_enabled = {}
+
+        # now monkey patch the following function:
+        #     def _set_gradient_checkpointing(self, module, value=False):
+        #         if hasattr(module, "gradient_checkpointing"):
+        #             module.gradient_checkpointing = value
+
+        def _set_gradient_checkpointing_new(self, module, value=False):
+            if hasattr(module, "gradient_checkpointing"):
+                module.gradient_checkpointing = value
+                modules_with_gc_enabled[module.__class__.__name__] = True
+
+        model_class_copy._set_gradient_checkpointing = _set_gradient_checkpointing_new
+
+        model = model_class_copy(**init_dict)
+        model.enable_gradient_checkpointing()
+
+        EXPECTED_SET = {
+            "CrossAttnUpBlockMotion",
+            "CrossAttnDownBlockMotion",
+            "UNetMidBlockCrossAttnMotion",
+            "UpBlockMotion",
+            "Transformer2DModel",
+            "DownBlockMotion",
+        }
+
+        assert set(modules_with_gc_enabled.keys()) == EXPECTED_SET
+        assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
+
+    def test_feed_forward_chunking(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        init_dict["norm_num_groups"] = 32
+
+        model = self.model_class(**init_dict)
+        model
+        model.eval()
+
+        with paddle.no_grad():
+            output = model(**inputs_dict)[0]
+
+        model.enable_forward_chunking()
+        with paddle.no_grad():
+            output_2 = model(**inputs_dict)[0]
+
+        self.assertEqual(output.shape, output_2.shape, "Shape doesn't match")
+        assert np.abs(output.cpu() - output_2.cpu()).max() < 1e-2
+
+    def test_pickle(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model
+
+        with paddle.no_grad():
+            sample = model(**inputs_dict).sample
+
+        sample_copy = copy.copy(sample)
+
+        assert (sample - sample_copy).abs().max() < 1e-4
+
+    def test_from_save_pretrained(self, expected_max_diff=5e-5):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        paddle.seed(0)
+        model = self.model_class(**init_dict)
+        model
+        model.eval()
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, safe_serialization=False)
+            paddle.seed(0)
+            new_model = self.model_class.from_pretrained(tmpdirname)
+            new_model
+
+        with paddle.no_grad():
+            image = model(**inputs_dict)
+            if isinstance(image, dict):
+                image = image.to_tuple()[0]
+
+            new_image = new_model(**inputs_dict)
+
+            if isinstance(new_image, dict):
+                new_image = new_image.to_tuple()[0]
+
+        max_diff = (image - new_image).abs().max().item()
+        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
+
+    def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        paddle.seed(0)
+        model = self.model_class(**init_dict)
+        model
+        model.eval()
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, variant="fp16", safe_serialization=False)
+
+            paddle.seed(0)
+            new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16")
+            # non-variant cannot be loaded
+            with self.assertRaises(OSError) as error_context:
+                self.model_class.from_pretrained(tmpdirname)
+
+            # make sure that error message states what keys are missing
+            assert "Error no file named diffusion_pytorch_model.bin found in directory" in str(error_context.exception)
+
+            new_model
+
+        with paddle.no_grad():
+            image = model(**inputs_dict)
+            if isinstance(image, dict):
+                image = image.to_tuple()[0]
+
+            new_image = new_model(**inputs_dict)
+
+            if isinstance(new_image, dict):
+                new_image = new_image.to_tuple()[0]
+
+        max_diff = (image - new_image).abs().max().item()
+        self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
+
+    def test_forward_with_norm_groups(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["norm_num_groups"] = 16
+        init_dict["block_out_channels"] = (16, 32)
+
+        model = self.model_class(**init_dict)
+        model.eval()
+
+        with paddle.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.to_tuple()[0]
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")

From 41b6fc677c0973605ac418fcfe5ae8f59a1aec7a Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Thu, 7 Dec 2023 10:03:19 +0800
Subject: [PATCH 12/19] Fix

---
 .../ppdiffusers/models/modeling_utils.py      |  2 ++
 .../ppdiffusers/models/unet_motion_model.py   |  2 +-
 .../tests/models/test_models_unet_motion.py   | 22 ++++++-------------
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/ppdiffusers/ppdiffusers/models/modeling_utils.py b/ppdiffusers/ppdiffusers/models/modeling_utils.py
index a86140987..8d81a7f6f 100644
--- a/ppdiffusers/ppdiffusers/models/modeling_utils.py
+++ b/ppdiffusers/ppdiffusers/models/modeling_utils.py
@@ -251,6 +251,8 @@ def save_pretrained(
         safe_serialization: bool = False,
         variant: Optional[str] = None,
         to_diffusers: Optional[bool] = None,
+        push_to_hub: bool = False,
+        **kwargs,
     ):
         """
         Save a model and its configuration file to a directory so that it can be reloaded using the
diff --git a/ppdiffusers/ppdiffusers/models/unet_motion_model.py b/ppdiffusers/ppdiffusers/models/unet_motion_model.py
index 71263cd04..60a1e683e 100644
--- a/ppdiffusers/ppdiffusers/models/unet_motion_model.py
+++ b/ppdiffusers/ppdiffusers/models/unet_motion_model.py
@@ -443,7 +443,7 @@ def from_unet2d(
             model.load_motion_modules(motion_adapter)
 
         # ensure that the Motion UNet is the same dtype as the UNet2DConditionModel
-        model.to(unet.dtype)
+        model.to(dtype=unet.dtype)
 
         return model
 
diff --git a/ppdiffusers/tests/models/test_models_unet_motion.py b/ppdiffusers/tests/models/test_models_unet_motion.py
index 94a6b9a07..78d68fa74 100644
--- a/ppdiffusers/tests/models/test_models_unet_motion.py
+++ b/ppdiffusers/tests/models/test_models_unet_motion.py
@@ -81,7 +81,7 @@ def test_from_unet2d(self):
         model_state_dict = model.state_dict()
 
         for param_name, param_value in unet2d.named_parameters():
-            self.assertTrue(paddle.equal(model_state_dict[param_name], param_value))
+            self.assertTrue(paddle.equal_all(model_state_dict[param_name], param_value))
 
     def test_freeze_unet2d(self):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -103,33 +103,32 @@ def test_loading_motion_adapter(self):
         for idx, down_block in enumerate(model.down_blocks):
             adapter_state_dict = adapter.down_blocks[idx].motion_modules.state_dict()
             for param_name, param_value in down_block.motion_modules.named_parameters():
-                self.assertTrue(paddle.equal(adapter_state_dict[param_name], param_value))
+                self.assertTrue(paddle.equal_all(adapter_state_dict[param_name], param_value))
 
         for idx, up_block in enumerate(model.up_blocks):
             adapter_state_dict = adapter.up_blocks[idx].motion_modules.state_dict()
             for param_name, param_value in up_block.motion_modules.named_parameters():
-                self.assertTrue(paddle.equal(adapter_state_dict[param_name], param_value))
+                self.assertTrue(paddle.equal_all(adapter_state_dict[param_name], param_value))
 
         mid_block_adapter_state_dict = adapter.mid_block.motion_modules.state_dict()
         for param_name, param_value in model.mid_block.motion_modules.named_parameters():
-            self.assertTrue(paddle.equal(mid_block_adapter_state_dict[param_name], param_value))
+            self.assertTrue(paddle.equal_all(mid_block_adapter_state_dict[param_name], param_value))
 
     def test_saving_motion_modules(self):
         paddle.seed(0)
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**init_dict)
-        model
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             model.save_motion_modules(tmpdirname)
-            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "diffusion_pytorch_model.safetensors")))
+            # pytorch: diffusion_pytorch_model.safetensors
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "model_state.pdparams")))
 
             adapter_loaded = MotionAdapter.from_pretrained(tmpdirname)
 
             paddle.seed(0)
             model_loaded = self.model_class(**init_dict)
             model_loaded.load_motion_modules(adapter_loaded)
-            model_loaded
 
         with paddle.no_grad():
             output = model(**inputs_dict)[0]
@@ -191,7 +190,6 @@ def test_feed_forward_chunking(self):
         init_dict["norm_num_groups"] = 32
 
         model = self.model_class(**init_dict)
-        model
         model.eval()
 
         with paddle.no_grad():
@@ -208,7 +206,6 @@ def test_pickle(self):
         # enable deterministic behavior for gradient checkpointing
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
         model = self.model_class(**init_dict)
-        model
 
         with paddle.no_grad():
             sample = model(**inputs_dict).sample
@@ -222,14 +219,12 @@ def test_from_save_pretrained(self, expected_max_diff=5e-5):
 
         paddle.seed(0)
         model = self.model_class(**init_dict)
-        model
         model.eval()
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             model.save_pretrained(tmpdirname, safe_serialization=False)
             paddle.seed(0)
             new_model = self.model_class.from_pretrained(tmpdirname)
-            new_model
 
         with paddle.no_grad():
             image = model(**inputs_dict)
@@ -249,7 +244,6 @@ def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
 
         paddle.seed(0)
         model = self.model_class(**init_dict)
-        model
         model.eval()
 
         with tempfile.TemporaryDirectory() as tmpdirname:
@@ -262,9 +256,7 @@ def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
                 self.model_class.from_pretrained(tmpdirname)
 
             # make sure that error message states what keys are missing
-            assert "Error no file named diffusion_pytorch_model.bin found in directory" in str(error_context.exception)
-
-            new_model
+            assert "Error no file named model_state.fp16.pdparams found in directory" in str(error_context.exception)
 
         with paddle.no_grad():
             image = model(**inputs_dict)

From 58f3f0fe1dbc3f1354feb5f5404eae489b024e72 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Thu, 7 Dec 2023 10:22:27 +0800
Subject: [PATCH 13/19] Fix

---
 .../ppdiffusers/models/unet_motion_model.py   |  8 +++----
 .../tests/models/test_models_unet_motion.py   | 21 +++++++++----------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/ppdiffusers/ppdiffusers/models/unet_motion_model.py b/ppdiffusers/ppdiffusers/models/unet_motion_model.py
index 60a1e683e..b521e9579 100644
--- a/ppdiffusers/ppdiffusers/models/unet_motion_model.py
+++ b/ppdiffusers/ppdiffusers/models/unet_motion_model.py
@@ -453,23 +453,23 @@ def freeze_unet2d_params(self):
         """
         # Freeze everything
         for param in self.parameters():
-            param.requires_grad = False
+            param.stop_gradient = True
 
         # Unfreeze Motion Modules
         for down_block in self.down_blocks:
             motion_modules = down_block.motion_modules
             for param in motion_modules.parameters():
-                param.requires_grad = True
+                param.stop_gradient = False
 
         for up_block in self.up_blocks:
             motion_modules = up_block.motion_modules
             for param in motion_modules.parameters():
-                param.requires_grad = True
+                param.stop_gradient = False
 
         if hasattr(self.mid_block, "motion_modules"):
             motion_modules = self.mid_block.motion_modules
             for param in motion_modules.parameters():
-                param.requires_grad = True
+                param.stop_gradient = False
 
         return
 
diff --git a/ppdiffusers/tests/models/test_models_unet_motion.py b/ppdiffusers/tests/models/test_models_unet_motion.py
index 78d68fa74..26055ea77 100644
--- a/ppdiffusers/tests/models/test_models_unet_motion.py
+++ b/ppdiffusers/tests/models/test_models_unet_motion.py
@@ -18,7 +18,6 @@
 import tempfile
 import unittest
 
-import numpy as np
 import paddle
 
 from ppdiffusers import MotionAdapter, UNet2DConditionModel, UNetMotionModel
@@ -91,7 +90,6 @@ def test_freeze_unet2d(self):
         for param_name, param_value in model.named_parameters():
             if "motion_modules" not in param_name:
                 self.assertFalse(not param_value.stop_gradient)
-
             else:
                 self.assertTrue(not param_value.stop_gradient)
 
@@ -200,19 +198,20 @@ def test_feed_forward_chunking(self):
             output_2 = model(**inputs_dict)[0]
 
         self.assertEqual(output.shape, output_2.shape, "Shape doesn't match")
-        assert np.abs(output.cpu() - output_2.cpu()).max() < 1e-2
+        assert paddle.abs(output.cpu() - output_2.cpu()).max() < 1e-2
 
-    def test_pickle(self):
-        # enable deterministic behavior for gradient checkpointing
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict)
+    # paddle.Tensor not support pickle
+    # def test_pickle(self):
+    #     # enable deterministic behavior for gradient checkpointing
+    #     init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+    #     model = self.model_class(**init_dict)
 
-        with paddle.no_grad():
-            sample = model(**inputs_dict).sample
+    #     with paddle.no_grad():
+    #         sample = model(**inputs_dict).sample
 
-        sample_copy = copy.copy(sample)
+    #     sample_copy = copy.copy(sample)
 
-        assert (sample - sample_copy).abs().max() < 1e-4
+    #     assert (sample - sample_copy).abs().max() < 1e-4
 
     def test_from_save_pretrained(self, expected_max_diff=5e-5):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

From afb402f361d545614797b5a358f7ccb7a8a5d574 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Thu, 7 Dec 2023 10:25:35 +0800
Subject: [PATCH 14/19] Fix

---
 ppdiffusers/tests/models/test_models_unet_motion.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ppdiffusers/tests/models/test_models_unet_motion.py b/ppdiffusers/tests/models/test_models_unet_motion.py
index 26055ea77..cfb5ab1cf 100644
--- a/ppdiffusers/tests/models/test_models_unet_motion.py
+++ b/ppdiffusers/tests/models/test_models_unet_motion.py
@@ -238,7 +238,8 @@ def test_from_save_pretrained(self, expected_max_diff=5e-5):
         max_diff = (image - new_image).abs().max().item()
         self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
 
-    def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
+    # test_modeling_common.py has the same name test: test_from_save_pretrained_variant
+    def test_from_save_pretrained_variant_motion(self, expected_max_diff=5e-5):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
 
         paddle.seed(0)

From 8a2f8e9753e291716f71c3fb5bd222eef26cb1b3 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Thu, 7 Dec 2023 10:29:03 +0800
Subject: [PATCH 15/19] Fix

---
 ppdiffusers/tests/models/test_models_unet_motion.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ppdiffusers/tests/models/test_models_unet_motion.py b/ppdiffusers/tests/models/test_models_unet_motion.py
index cfb5ab1cf..11c730095 100644
--- a/ppdiffusers/tests/models/test_models_unet_motion.py
+++ b/ppdiffusers/tests/models/test_models_unet_motion.py
@@ -238,8 +238,7 @@ def test_from_save_pretrained(self, expected_max_diff=5e-5):
         max_diff = (image - new_image).abs().max().item()
         self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
 
-    # test_modeling_common.py has the same name test: test_from_save_pretrained_variant
-    def test_from_save_pretrained_variant_motion(self, expected_max_diff=5e-5):
+    def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
 
         paddle.seed(0)
@@ -256,7 +255,7 @@ def test_from_save_pretrained_variant_motion(self, expected_max_diff=5e-5):
                 self.model_class.from_pretrained(tmpdirname)
 
             # make sure that error message states what keys are missing
-            assert "Error no file named model_state.fp16.pdparams found in directory" in str(error_context.exception)
+            assert "Error no file named model_state.pdparams found in directory" in str(error_context.exception)
 
         with paddle.no_grad():
             image = model(**inputs_dict)

From a673b5f187b9328852c002c38e002146eb502b2e Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Thu, 7 Dec 2023 11:33:03 +0800
Subject: [PATCH 16/19] Fix

---
 ppdiffusers/ppdiffusers/models/unet_motion_model.py             | 2 +-
 .../tests/pipelines/stable_diffusion/test_stable_diffusion.py   | 2 +-
 .../pipelines/stable_diffusion/test_stable_diffusion_inpaint.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ppdiffusers/ppdiffusers/models/unet_motion_model.py b/ppdiffusers/ppdiffusers/models/unet_motion_model.py
index b521e9579..5a74528d5 100644
--- a/ppdiffusers/ppdiffusers/models/unet_motion_model.py
+++ b/ppdiffusers/ppdiffusers/models/unet_motion_model.py
@@ -763,7 +763,7 @@ def forward(
         # timesteps does not contain any weights and will always return f32 tensors
         # but time_embedding might actually be running in fp16. so we need to cast here.
         # there might be better ways to encapsulate this.
-        t_emb = t_emb.to(dtype=self.dtype)
+        t_emb = t_emb.cast(dtype=self.dtype)  # paddle develop has .to()
 
         emb = self.time_embedding(t_emb, timestep_cond)
         emb = emb.repeat_interleave(repeats=num_frames, axis=0)
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 8bae56e28..f355e1804 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -37,7 +37,7 @@
 from ppdiffusers.utils import nightly, slow
 from ppdiffusers.utils.testing_utils import CaptureLogger, require_paddle_gpu
 
-from ...lora.test_lora_layers import create_lora_layers
+from ...lora.test_lora_layers_old_backend import create_lora_layers
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin
 
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 98a974cfa..965f70e60 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -40,7 +40,7 @@
     require_paddle_gpu,
 )
 
-from ...lora.test_lora_layers import create_lora_layers
+from ...lora.test_lora_layers_old_backend import create_lora_layers
 from ..pipeline_params import (
     TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
     TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,

From 805e34568708e0b730bfffb4a168e387eb074105 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Fri, 8 Dec 2023 09:46:25 +0800
Subject: [PATCH 17/19] Fix

---
 ppdiffusers/ppdiffusers/models/embeddings.py          | 4 ++--
 ppdiffusers/ppdiffusers/models/t5_film_transformer.py | 6 ++++--
 ppdiffusers/ppdiffusers/models/vq_model.py            | 2 ++
 ppdiffusers/tests/models/test_models_unet_motion.py   | 3 +++
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/ppdiffusers/ppdiffusers/models/embeddings.py b/ppdiffusers/ppdiffusers/models/embeddings.py
index ecc83b8e6..d6b766b60 100644
--- a/ppdiffusers/ppdiffusers/models/embeddings.py
+++ b/ppdiffusers/ppdiffusers/models/embeddings.py
@@ -311,10 +311,10 @@ def __init__(self, embed_dim: int, max_seq_length: int = 32):
         div_term = paddle.exp(paddle.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
         pe = paddle.zeros([1, max_seq_length, embed_dim])
         pe[0, :, 0::2] = paddle.sin(
-            paddle.to_tensor(position * div_term, dtype=paddle.get_default_dtype())
+            (position * div_term).cast(paddle.get_default_dtype())
         )  # paddle: sin not support int64, convert to float32
         pe[0, :, 1::2] = paddle.cos(
-            paddle.to_tensor(position * div_term, dtype=paddle.get_default_dtype())
+            (position * div_term).cast(paddle.get_default_dtype())
         )  # paddle: cos not support int64, convert to float32
         self.register_buffer("pe", pe)
 
diff --git a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
index 7409275b6..515538951 100644
--- a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
+++ b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
@@ -92,7 +92,9 @@ def encoder_decoder_mask(self, query_input, key_input):
 
     def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time):
         batch, _, _ = decoder_input_tokens.shape
-        assert decoder_noise_time.shape == (batch,)
+        assert decoder_noise_time.shape == [
+            batch,
+        ]
 
         # decoder_noise_time is in [0, 1), so rescale to expected timing range.
         time_steps = get_timestep_embedding(
@@ -103,7 +105,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time)
 
         conditioning_emb = self.conditioning_emb(time_steps).unsqueeze(1)
 
-        assert conditioning_emb.shape == (batch, 1, self.config.d_model * 4)
+        assert conditioning_emb.shape == [batch, 1, self.config.d_model * 4]
 
         seq_length = decoder_input_tokens.shape[1]
 
diff --git a/ppdiffusers/ppdiffusers/models/vq_model.py b/ppdiffusers/ppdiffusers/models/vq_model.py
index ec67e6c75..68de2e271 100644
--- a/ppdiffusers/ppdiffusers/models/vq_model.py
+++ b/ppdiffusers/ppdiffusers/models/vq_model.py
@@ -135,6 +135,8 @@ def encode(self, x: paddle.Tensor, return_dict: bool = True):
     def decode(
         self, h: paddle.Tensor, force_not_quantize: bool = False, return_dict: bool = True
     ) -> Union[DecoderOutput, paddle.Tensor]:
+        # cast h to float16 / float32
+        h = h.cast(self.dtype)
         # also go through quantization layer
         if not force_not_quantize:
             quant, emb_loss, info = self.quantize(h)
diff --git a/ppdiffusers/tests/models/test_models_unet_motion.py b/ppdiffusers/tests/models/test_models_unet_motion.py
index 11c730095..15e03f62d 100644
--- a/ppdiffusers/tests/models/test_models_unet_motion.py
+++ b/ppdiffusers/tests/models/test_models_unet_motion.py
@@ -288,3 +288,6 @@ def test_forward_with_norm_groups(self):
         self.assertIsNotNone(output)
         expected_shape = inputs_dict["sample"].shape
         self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_from_save_pretrained_dtype(self):
+        pass

From 7fdcdb4337044193a0b424a67b5947400fc57abb Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Fri, 8 Dec 2023 10:37:33 +0800
Subject: [PATCH 18/19] Fix

---
 ppdiffusers/ppdiffusers/models/embeddings.py        | 5 ++++-
 ppdiffusers/tests/models/test_models_unet_motion.py | 3 ---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ppdiffusers/ppdiffusers/models/embeddings.py b/ppdiffusers/ppdiffusers/models/embeddings.py
index d6b766b60..4e4c605a3 100644
--- a/ppdiffusers/ppdiffusers/models/embeddings.py
+++ b/ppdiffusers/ppdiffusers/models/embeddings.py
@@ -316,7 +316,10 @@ def __init__(self, embed_dim: int, max_seq_length: int = 32):
         pe[0, :, 1::2] = paddle.cos(
             (position * div_term).cast(paddle.get_default_dtype())
         )  # paddle: cos not support int64, convert to float32
-        self.register_buffer("pe", pe)
+        # When use register_buffer, CI occur error in UNetMotionModelTests::test_from_save_pretrained_dtype
+        # self.register_buffer("pe", pe)
+        pe.stop_gradient = True
+        self.pe = pe
 
     def forward(self, x):
         _, seq_length, _ = x.shape
diff --git a/ppdiffusers/tests/models/test_models_unet_motion.py b/ppdiffusers/tests/models/test_models_unet_motion.py
index 15e03f62d..11c730095 100644
--- a/ppdiffusers/tests/models/test_models_unet_motion.py
+++ b/ppdiffusers/tests/models/test_models_unet_motion.py
@@ -288,6 +288,3 @@ def test_forward_with_norm_groups(self):
         self.assertIsNotNone(output)
         expected_shape = inputs_dict["sample"].shape
         self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
-
-    def test_from_save_pretrained_dtype(self):
-        pass

From 5d2ea29e86739e3dc2f4c191593f20d09e4d26c6 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Fri, 8 Dec 2023 11:16:15 +0800
Subject: [PATCH 19/19] Fix

---
 ppdiffusers/ppdiffusers/models/embeddings.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/ppdiffusers/ppdiffusers/models/embeddings.py b/ppdiffusers/ppdiffusers/models/embeddings.py
index 4e4c605a3..bfc502690 100644
--- a/ppdiffusers/ppdiffusers/models/embeddings.py
+++ b/ppdiffusers/ppdiffusers/models/embeddings.py
@@ -311,10 +311,12 @@ def __init__(self, embed_dim: int, max_seq_length: int = 32):
         div_term = paddle.exp(paddle.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
         pe = paddle.zeros([1, max_seq_length, embed_dim])
         pe[0, :, 0::2] = paddle.sin(
-            (position * div_term).cast(paddle.get_default_dtype())
+            (position * div_term).cast(paddle.float32)  # use paddle.get_default_type(), CI occur error
+        ).cast(
+            pe.dtype
         )  # paddle: sin not support int64, convert to float32
-        pe[0, :, 1::2] = paddle.cos(
-            (position * div_term).cast(paddle.get_default_dtype())
+        pe[0, :, 1::2] = paddle.cos((position * div_term).cast(paddle.float32)).cast(
+            pe.dtype
         )  # paddle: cos not support int64, convert to float32
         # When use register_buffer, CI occur error in UNetMotionModelTests::test_from_save_pretrained_dtype
         # self.register_buffer("pe", pe)