diff --git a/invokeai/backend/stable_diffusion/diffusion/regional_ip_data.py b/invokeai/backend/stable_diffusion/diffusion/regional_ip_data.py
index d3b4505f587..792c97114da 100644
--- a/invokeai/backend/stable_diffusion/diffusion/regional_ip_data.py
+++ b/invokeai/backend/stable_diffusion/diffusion/regional_ip_data.py
@@ -59,8 +59,11 @@ def _prepare_masks(
                 if downscale_factor <= max_downscale_factor:
                     # We use max pooling because we downscale to a pretty low resolution, so we don't want small mask
                     # regions to be lost entirely.
+                    #
+                    # ceil_mode=True is set to mirror the downsampling behavior of SD and SDXL.
+                    #
                     # TODO(ryand): In the future, we may want to experiment with other downsampling methods.
-                    mask_tensor = torch.nn.functional.max_pool2d(mask_tensor, kernel_size=2, stride=2)
+                    mask_tensor = torch.nn.functional.max_pool2d(mask_tensor, kernel_size=2, stride=2, ceil_mode=True)

         return masks_by_seq_len

diff --git a/invokeai/backend/stable_diffusion/diffusion/regional_prompt_data.py b/invokeai/backend/stable_diffusion/diffusion/regional_prompt_data.py
index 85331013d5f..f09cc0a0d21 100644
--- a/invokeai/backend/stable_diffusion/diffusion/regional_prompt_data.py
+++ b/invokeai/backend/stable_diffusion/diffusion/regional_prompt_data.py
@@ -61,9 +61,12 @@ def _prepare_spatial_masks(
                 if downscale_factor <= max_downscale_factor:
                     # We use max pooling because we downscale to a pretty low resolution, so we don't want small prompt
                     # regions to be lost entirely.
+                    #
+                    # ceil_mode=True is set to mirror the downsampling behavior of SD and SDXL.
+                    #
                     # TODO(ryand): In the future, we may want to experiment with other downsampling methods (e.g.
                     # nearest interpolation), and could potentially use a weighted mask rather than a binary mask.
-                    batch_sample_masks = F.max_pool2d(batch_sample_masks, kernel_size=2, stride=2)
+                    batch_sample_masks = F.max_pool2d(batch_sample_masks, kernel_size=2, stride=2, ceil_mode=True)

         return batch_sample_masks_by_seq_len
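
For context, a minimal standalone sketch of why ceil_mode=True is needed (not part of the diff; the shapes and the Conv2d layer below are illustrative assumptions, not InvokeAI code). The SD/SDXL UNet downsamples feature maps with a kernel-3, stride-2, padding-1 convolution, so an odd spatial size H becomes ceil(H / 2). max_pool2d with the default ceil_mode=False produces floor(H / 2), which would leave the pooled mask one row or column smaller than the attention feature map it must align with.

import torch
import torch.nn.functional as F

# Hypothetical odd latent height (e.g. an image whose height is not a multiple of 16).
h, w = 67, 128
mask = torch.ones(1, 1, h, w)

# Default pooling rounds down; ceil_mode=True rounds up.
pooled_floor = F.max_pool2d(mask, kernel_size=2, stride=2)                 # 33 x 64
pooled_ceil = F.max_pool2d(mask, kernel_size=2, stride=2, ceil_mode=True)  # 34 x 64

# Stand-in for the UNet's stride-2 downsampling convolution (kernel 3, padding 1).
downsample = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2, padding=1)
feature = downsample(mask)

print(pooled_floor.shape[-2:])  # torch.Size([33, 64])
print(pooled_ceil.shape[-2:])   # torch.Size([34, 64])
print(feature.shape[-2:])       # torch.Size([34, 64]) -- matches ceil_mode=True

With ceil_mode=True, the pooled mask's height * width equals the query sequence length of the attention layer at that resolution, so the per-region masks keyed by sequence length keep lining up for non-standard image sizes.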