Merge branch 'main' into add-DDIM-noise-comparative-analysis-pipeline
aengusng8 authored Mar 14, 2023
2 parents d578650 + d9b8adc commit 18241d1
Showing 6 changed files with 392 additions and 36 deletions.
4 changes: 2 additions & 2 deletions docs/source/en/using-diffusers/schedulers.mdx
@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.
# Schedulers

Diffusion pipelines are inherently a collection of diffusion models and schedulers that are partly independent from each other. This means that one is able to switch out parts of the pipeline to better customize
- a pipeline to one's use case. The best example of this are the [Schedulers](../api/schedulers/overview.mdx).
+ a pipeline to one's use case. The best example of this is the [Schedulers](../api/schedulers/overview.mdx).

Whereas diffusion models usually simply define the forward pass from noise to a less noisy sample,
schedulers define the whole denoising process, *i.e.*:
@@ -24,7 +24,7 @@ schedulers define the whole denoising process, *i.e.*:
They can be quite complex and often define a trade-off between **denoising speed** and **denoising quality**.
It is extremely difficult to measure quantitatively which scheduler works best for a given diffusion pipeline, so it is often recommended to simply try out which works best.

- The following paragraphs shows how to do so with the 🧨 Diffusers library.
+ The following paragraphs show how to do so with the 🧨 Diffusers library.

## Load pipeline

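For context, the section being edited goes on to demonstrate scheduler swapping (the rest of the file is collapsed in this diff). A minimal sketch of that workflow with the public Diffusers API, using an illustrative checkpoint and scheduler choice rather than the exact ones from the doc:

```python
import torch
from diffusers import DiffusionPipeline, EulerDiscreteScheduler

# Load any diffusion pipeline; the checkpoint below is only an example.
pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)

# List the schedulers this pipeline is compatible with.
print(pipeline.scheduler.compatibles)

# Swap the scheduler in place, reusing the current scheduler's configuration.
pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
```
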
7 changes: 5 additions & 2 deletions src/diffusers/models/attention.py
@@ -271,9 +271,10 @@ def __init__(
def forward(
self,
hidden_states,
+ attention_mask=None,
encoder_hidden_states=None,
+ encoder_attention_mask=None,
timestep=None,
- attention_mask=None,
cross_attention_kwargs=None,
class_labels=None,
):
@@ -302,12 +303,14 @@ def forward(
norm_hidden_states = (
self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
)
+ # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly
+ # prepare attention mask here

# 2. Cross-Attention
attn_output = self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
+ attention_mask=encoder_attention_mask,
**cross_attention_kwargs,
)
hidden_states = attn_output + hidden_states
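
This hunk separates the mask handed to self-attention (`attention_mask`) from the one handed to cross-attention (`encoder_attention_mask`), and the TODO notes that the encoder mask still has to be prepared before use. As a rough illustration (a generic sketch, not the library's implementation), preparing such a mask usually means turning a boolean padding mask over the text tokens into an additive bias for the attention softmax:

```python
import torch

def prepare_encoder_attention_mask(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # mask: (batch, encoder_seq_len); 1/True marks real tokens, 0/False marks padding.
    bias = (1.0 - mask.to(dtype)) * torch.finfo(dtype).min
    # Add a broadcast dimension so the bias applies to every query position.
    return bias.unsqueeze(1)  # (batch, 1, encoder_seq_len)

mask = torch.tensor([[1, 1, 1, 0, 0]])            # one prompt with two padding tokens
bias = prepare_encoder_attention_mask(mask, torch.float32)
print(bias.shape)                                  # torch.Size([1, 1, 5])
```

The function name and shape convention above are hypothetical; they only illustrate why reusing the self-attention mask for cross-attention was incorrect, since the two attend over sequences of different lengths.
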
5 changes: 5 additions & 0 deletions src/diffusers/models/controlnet.py
@@ -389,6 +389,7 @@ def forward(
timestep: Union[torch.Tensor, float, int],
encoder_hidden_states: torch.Tensor,
controlnet_cond: torch.FloatTensor,
+ conditioning_scale: float = 1.0,
class_labels: Optional[torch.Tensor] = None,
timestep_cond: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
@@ -492,6 +493,10 @@

mid_block_res_sample = self.controlnet_mid_block(sample)

+ # 6. scaling
+ down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
+ mid_block_res_sample *= conditioning_scale
+
if not return_dict:
return (down_block_res_samples, mid_block_res_sample)

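The new `conditioning_scale` argument multiplies every residual the ControlNet returns before the UNet consumes it, so 0.0 effectively disables the conditioning and 1.0 applies it at full strength. A toy sketch of just that arithmetic, with illustrative shapes:

```python
import torch

conditioning_scale = 0.5  # illustrative value

# Stand-ins for the ControlNet outputs; real shapes depend on the UNet configuration.
down_block_res_samples = [torch.randn(1, 320, 64, 64), torch.randn(1, 640, 32, 32)]
mid_block_res_sample = torch.randn(1, 1280, 8, 8)

# Same scaling as the added lines above.
down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
mid_block_res_sample = mid_block_res_sample * conditioning_scale
```

At the pipeline level this knob is typically exposed as `controlnet_conditioning_scale` on the ControlNet pipelines, though that naming is outside the scope of this diff.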