Resolve tests and tidy up
amyeroberts committed Aug 16, 2023
1 parent 59f15de commit 6298095
Showing 28 changed files with 312 additions and 50 deletions.
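Most of the hunks below apply one recurring fix: image-processing helpers now receive the caller-supplied `input_data_format` explicitly (and positionally where the helper takes a positional `channel_dim`), instead of re-inferring the channel layout on every call or passing it under a keyword the helper does not accept. A minimal sketch of the idea, using simplified stand-ins rather than the actual transformers helpers:

```python
from typing import Optional, Tuple

import numpy as np


def infer_channel_dimension_format(image: np.ndarray) -> str:
    """Simplified stand-in: guess whether the channel axis comes first or last."""
    if image.ndim != 3:
        raise ValueError("Expected a 3D image array")
    if image.shape[0] in (1, 3):
        return "channels_first"
    if image.shape[-1] in (1, 3):
        return "channels_last"
    raise ValueError("Could not infer the channel dimension format")


def get_image_size(image: np.ndarray, channel_dim: Optional[str] = None) -> Tuple[int, int]:
    """Simplified stand-in: the second parameter is positional `channel_dim`,
    so a keyword call like `get_image_size(image, input_data_format=...)` fails."""
    if channel_dim is None:
        channel_dim = infer_channel_dimension_format(image)
    if channel_dim == "channels_first":
        return image.shape[1], image.shape[2]
    return image.shape[0], image.shape[1]


image = np.zeros((3, 480, 640), dtype=np.float32)          # channels-first RGB
input_data_format = infer_channel_dimension_format(image)  # infer once
print(get_image_size(image, input_data_format))            # then pass it on: (480, 640)
```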
4 changes: 1 addition & 3 deletions src/transformers/models/beit/image_processing_beit.py
@@ -238,9 +238,7 @@ def _preprocess_image(
# All transformations expect numpy arrays.
image = to_numpy_array(image)
if input_data_format is None:
- input_data_format = infer_channel_dimension_format(
-     image, input_data_format=input_data_format, num_channels=num_channels
- )
+ input_data_format = infer_channel_dimension_format(image, num_channels=num_channels)
image = self._preprocess(
image,
do_reduce_labels=False,
@@ -95,7 +95,7 @@ def get_resize_output_image_size(
size_divisor: int = 32,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
- input_height, input_width = get_image_size(input_image, input_data_format=input_data_format)
+ input_height, input_width = get_image_size(input_image, input_data_format)
min_size, max_size = shorter, longer

scale = min_size / min(input_height, input_width)
@@ -508,7 +508,7 @@ def preprocess(

if do_pad:
encoded_outputs = self.pad(
- images, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=input_data_format
+ images, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=data_format
)
else:
encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
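The hunk above sits in a shortest/longest-edge resize: the short side is scaled to `shorter`, the long side is capped at `longer`, and the result is rounded to a multiple of `size_divisor`. A hedged sketch of that rule (the exact rounding in the real `get_resize_output_image_size` may differ):

```python
def resize_output_size(height: int, width: int, shorter: int = 800, longer: int = 1333, size_divisor: int = 32):
    # Scale the shorter side to `shorter`, but never let the longer side exceed `longer`.
    scale = shorter / min(height, width)
    if max(height, width) * scale > longer:
        scale = longer / max(height, width)
    # Round each side to a multiple of `size_divisor` (rounding strategy assumed).
    new_height = int(round(height * scale / size_divisor) * size_divisor)
    new_width = int(round(width * scale / size_divisor) * size_divisor)
    return new_height, new_width


print(resize_output_size(480, 640))  # (800, 1056) with the defaults above
```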
@@ -144,7 +144,7 @@ def get_resize_output_image_size(
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
- image_size = get_image_size(input_image, input_data_format=input_data_format)
+ image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size

@@ -1291,12 +1291,12 @@ def preprocess(
if annotations is not None:
resized_images, resized_annotations = [], []
for image, target in zip(images, annotations):
- orig_size = get_image_size(image)
+ orig_size = get_image_size(image, input_data_format)
resized_image = self.resize(
image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format
)
resized_annotation = self.resize_annotation(
- target, orig_size, get_image_size(resized_image), input_data_format=input_data_format
+ target, orig_size, get_image_size(resized_image, input_data_format)
)
resized_images.append(resized_image)
resized_annotations.append(resized_annotation)
@@ -1318,7 +1318,9 @@ def preprocess(
]
if annotations is not None:
annotations = [
- self.normalize_annotation(annotation, get_image_size(image), input_data_format=input_data_format)
+ self.normalize_annotation(
+     annotation, get_image_size(image, input_data_format), input_data_format=input_data_format
+ )
for annotation, image in zip(annotations, images)
]

@@ -154,7 +154,14 @@ def resize(
resize_size = get_resize_output_image_size(
image, size=resize_shortest_edge, default_to_square=False, input_data_format=input_data_format
)
- image = resize(image=image, size=resize_size, resample=resample, data_format=data_format, **kwargs)
+ image = resize(
+     image=image,
+     size=resize_size,
+     resample=resample,
+     data_format=data_format,
+     input_data_format=input_data_format,
+     **kwargs,
+ )
# then crop to (shortest_edge, shortest_edge)
return center_crop(
image=image,
@@ -143,7 +143,7 @@ def get_resize_output_image_size(
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
- image_size = get_image_size(input_image, input_data_format=input_data_format)
+ image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size

@@ -1289,12 +1289,12 @@ def preprocess(
if annotations is not None:
resized_images, resized_annotations = [], []
for image, target in zip(images, annotations):
- orig_size = get_image_size(image)
+ orig_size = get_image_size(image, input_data_format)
resized_image = self.resize(
image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format
)
resized_annotation = self.resize_annotation(
- target, orig_size, get_image_size(resized_image), input_data_format=input_data_format
+ target, orig_size, get_image_size(resized_image, input_data_format)
)
resized_images.append(resized_image)
resized_annotations.append(resized_annotation)
@@ -1316,7 +1316,9 @@ def preprocess(
]
if annotations is not None:
annotations = [
- self.normalize_annotation(annotation, get_image_size(image), input_data_format=input_data_format)
+ self.normalize_annotation(
+     annotation, get_image_size(image, input_data_format), input_data_format=input_data_format
+ )
for annotation, image in zip(annotations, images)
]

10 changes: 6 additions & 4 deletions src/transformers/models/deta/image_processing_deta.py
@@ -135,7 +135,7 @@ def get_resize_output_image_size(
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
- image_size = get_image_size(input_image, input_data_format=input_data_format)
+ image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size

@@ -953,11 +953,13 @@ def preprocess(
if annotations is not None:
resized_images, resized_annotations = [], []
for image, target in zip(images, annotations):
- orig_size = get_image_size(image)
+ orig_size = get_image_size(image, input_data_format)
resized_image = self.resize(
image, size=size, resample=resample, input_data_format=input_data_format
)
- resized_annotation = self.resize_annotation(target, orig_size, get_image_size(resized_image))
+ resized_annotation = self.resize_annotation(
+     target, orig_size, get_image_size(resized_image, input_data_format)
+ )
resized_images.append(resized_image)
resized_annotations.append(resized_annotation)
images = resized_images
@@ -978,7 +980,7 @@ def preprocess(
]
if annotations is not None:
annotations = [
- self.normalize_annotation(annotation, get_image_size(image))
+ self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]

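The `normalize_annotation` calls above are why the true `(height, width)` now comes from `get_image_size(image, input_data_format)`: the annotation's absolute pixel boxes are divided by the image size. A hedged sketch of that step for corner-format boxes (the real helper also converts to center format and rescales other annotation keys):

```python
import numpy as np


def normalize_boxes(boxes: np.ndarray, image_size: tuple) -> np.ndarray:
    """Scale absolute [x0, y0, x1, y1] boxes into the [0, 1] range by image size."""
    height, width = image_size
    return boxes / np.array([width, height, width, height], dtype=np.float32)


boxes = np.array([[32.0, 40.0, 96.0, 120.0]])
print(normalize_boxes(boxes, (240, 320)))  # approximately [[0.1  0.1667  0.3  0.5]]
```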
10 changes: 6 additions & 4 deletions src/transformers/models/detr/image_processing_detr.py
@@ -141,7 +141,7 @@ def get_resize_output_image_size(
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
- image_size = get_image_size(input_image, input_data_format=input_data_format)
+ image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size

@@ -1261,12 +1261,12 @@ def preprocess(
if annotations is not None:
resized_images, resized_annotations = [], []
for image, target in zip(images, annotations):
- orig_size = get_image_size(image)
+ orig_size = get_image_size(image, input_data_format)
resized_image = self.resize(
image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format
)
resized_annotation = self.resize_annotation(
- target, orig_size, get_image_size(resized_image), input_data_format=input_data_format
+ target, orig_size, get_image_size(resized_image, input_data_format)
)
resized_images.append(resized_image)
resized_annotations.append(resized_annotation)
@@ -1288,7 +1288,9 @@ def preprocess(
]
if annotations is not None:
annotations = [
- self.normalize_annotation(annotation, get_image_size(image), input_data_format=input_data_format)
+ self.normalize_annotation(
+     annotation, get_image_size(image, input_data_format), input_data_format=input_data_format
+ )
for annotation, image in zip(annotations, images)
]

4 changes: 2 additions & 2 deletions src/transformers/models/dpt/image_processing_dpt.py
@@ -68,7 +68,7 @@ def constraint_to_multiple_of(val, multiple, min_val=0, max_val=None):

output_size = (output_size, output_size) if isinstance(output_size, int) else output_size

- input_height, input_width = get_image_size(input_image, input_data_format=input_data_format)
+ input_height, input_width = get_image_size(input_image, input_data_format)
output_height, output_width = output_size

# determine new height and width
@@ -188,7 +188,7 @@ def resize(
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
- size = get_size_dict(size, input_data_format=input_data_format)
+ size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
output_size = get_resize_output_image_size(
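`constraint_to_multiple_of`, named in the first hunk header above, snaps a computed dimension to a multiple of `multiple` while respecting optional bounds; a hedged sketch consistent with that signature (the exact rounding strategy is an assumption):

```python
import math


def constraint_to_multiple_of(val, multiple, min_val=0, max_val=None):
    # Round to the nearest multiple, backing off to stay within the bounds.
    x = round(val / multiple) * multiple
    if max_val is not None and x > max_val:
        x = math.floor(val / multiple) * multiple
    if x < min_val:
        x = math.ceil(val / multiple) * multiple
    return x


print(constraint_to_multiple_of(250, 32))  # 256
```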
@@ -350,7 +350,7 @@ def preprocess(

if include_top:
images = [
- self.normalize(image=image, mean=[0, 0, 0], std=image_std, input_data_format=input_data_format)
+ self.normalize(image=image, mean=0, std=image_std, input_data_format=input_data_format)
for image in images
]

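Replacing `mean=[0, 0, 0]` with `mean=0` relies on NumPy broadcasting, so the call no longer hard-codes a three-channel image; a quick check of the equivalence:

```python
import numpy as np

image = np.random.rand(224, 224, 3).astype(np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

# A scalar mean of 0 broadcasts over any number of channels,
# whereas [0, 0, 0] only fits three-channel inputs.
a = (image - 0) / std
b = (image - np.array([0.0, 0.0, 0.0], dtype=np.float32)) / std
print(np.allclose(a, b))  # True
```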
@@ -51,12 +51,17 @@ def normalize_box(box, width, height):
]


- def apply_tesseract(image: np.ndarray, lang: Optional[str], tesseract_config: Optional[str] = None):
+ def apply_tesseract(
+     image: np.ndarray,
+     lang: Optional[str],
+     tesseract_config: Optional[str] = None,
+     input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
tesseract_config = tesseract_config if tesseract_config is not None else ""

# apply OCR
- pil_image = to_pil_image(image)
+ pil_image = to_pil_image(image, input_data_format=input_data_format)
image_width, image_height = pil_image.size
data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
@@ -260,7 +265,7 @@ def preprocess(
words_batch = []
boxes_batch = []
for image in images:
- words, boxes = apply_tesseract(image, ocr_lang, tesseract_config)
+ words, boxes = apply_tesseract(image, ocr_lang, tesseract_config, input_data_format=input_data_format)
words_batch.append(words)
boxes_batch.append(boxes)

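`to_pil_image` has to know where the channel axis sits before handing the array to PIL, which is what the new `input_data_format` argument conveys; a hedged stand-alone sketch of that conversion (simplified, uint8 data assumed, not the transformers implementation):

```python
import numpy as np
from PIL import Image


def to_pil_image(image: np.ndarray, input_data_format: str = "channels_last") -> Image.Image:
    """Move a channels-first array to height-width-channels before building the PIL image."""
    if input_data_format == "channels_first":
        image = np.transpose(image, (1, 2, 0))
    return Image.fromarray(image.astype(np.uint8))


chw = np.zeros((3, 60, 80), dtype=np.uint8)                  # channels-first input
pil = to_pil_image(chw, input_data_format="channels_first")
print(pil.size)                                              # (80, 60): PIL reports (width, height)
```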
@@ -53,11 +53,16 @@ def normalize_box(box, width, height):
]


- def apply_tesseract(image: np.ndarray, lang: Optional[str], tesseract_config: Optional[str]):
+ def apply_tesseract(
+     image: np.ndarray,
+     lang: Optional[str],
+     tesseract_config: Optional[str],
+     input_data_format: Optional[Union[ChannelDimension, str]] = None,
+ ):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""

# apply OCR
- pil_image = to_pil_image(image)
+ pil_image = to_pil_image(image, input_data_format=input_data_format)
image_width, image_height = pil_image.size
data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
@@ -323,7 +328,7 @@ def preprocess(
words_batch = []
boxes_batch = []
for image in images:
- words, boxes = apply_tesseract(image, ocr_lang, tesseract_config)
+ words, boxes = apply_tesseract(image, ocr_lang, tesseract_config, input_data_format=input_data_format)
words_batch.append(words)
boxes_batch.append(boxes)

@@ -632,6 +632,7 @@ def _preprocess_mask(
do_resize: bool = None,
size: Dict[str, int] = None,
size_divisor: int = 0,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""Preprocesses a single mask."""
segmentation_map = to_numpy_array(segmentation_map)
@@ -920,7 +921,9 @@ def encode_inputs(
if input_data_format is None:
input_data_format = infer_channel_dimension_format(pixel_values_list[0])

- encoded_inputs = self.pad(pixel_values_list, return_tensors=return_tensors)
+ encoded_inputs = self.pad(
+     pixel_values_list, return_tensors=return_tensors, input_data_format=input_data_format
+ )

if segmentation_maps is not None:
mask_labels = []
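The `self.pad` call being updated pads every image in the batch to a common size and returns a pixel mask marking the real pixels, so it too has to know the channel layout; a hedged sketch of that behaviour for channels-first inputs:

```python
import numpy as np


def pad_batch_channels_first(images):
    """Bottom/right-pad a list of CHW images to the batch maximum and build pixel masks."""
    max_h = max(img.shape[1] for img in images)
    max_w = max(img.shape[2] for img in images)
    pixel_values, pixel_mask = [], []
    for img in images:
        c, h, w = img.shape
        padded = np.zeros((c, max_h, max_w), dtype=img.dtype)
        padded[:, :h, :w] = img
        mask = np.zeros((max_h, max_w), dtype=np.int64)
        mask[:h, :w] = 1
        pixel_values.append(padded)
        pixel_mask.append(mask)
    return np.stack(pixel_values), np.stack(pixel_mask)


batch, mask = pad_batch_channels_first([np.ones((3, 48, 64)), np.ones((3, 64, 32))])
print(batch.shape, mask.shape)  # (2, 3, 64, 64) (2, 64, 64)
```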
@@ -269,7 +269,7 @@ def extract_flattened_patches(
image = torch.from_numpy(image)

patch_height, patch_width = patch_size["height"], patch_size["width"]
- image_height, image_width = get_image_size(image, input_data_format=ChannelDimension.FIRST)
+ image_height, image_width = get_image_size(image, ChannelDimension.FIRST)

# maximize scale s.t.
scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
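The `scale` line shown above picks the largest zoom at which the patch grid still fits under `max_patches`; a quick worked example (the row/column clamping that typically follows is included only for illustration):

```python
import math

image_height, image_width = 480, 640
patch_height, patch_width = 16, 16
max_patches = 2048

# maximize scale s.t. (scale * H / ph) * (scale * W / pw) <= max_patches
scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
print(round(scale, 3), rows, cols, rows * cols)  # 1.306 39 52 2028 (<= 2048)
```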
19 changes: 14 additions & 5 deletions src/transformers/models/sam/image_processing_sam.py
@@ -568,6 +568,7 @@ def generate_crop_boxes(
points_per_crop: Optional[int] = 32,
crop_n_points_downscale_factor: Optional[List[int]] = 1,
device: Optional["torch.device"] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
return_tensors: str = "pt",
):
"""
@@ -590,6 +591,8 @@
The number of points-per-side sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
device (`torch.device`, *optional*, defaults to None):
Device to use for the computation. If None, cpu will be used.
+ input_data_format (`str` or `ChannelDimension`, *optional*):
+     The channel dimension format of the input image. If not provided, it will be inferred.
return_tensors (`str`, *optional*, defaults to `pt`):
If `pt`, returns `torch.Tensor`. If `tf`, returns `tf.Tensor`.
"""
@@ -600,6 +603,7 @@ def generate_crop_boxes(
overlap_ratio,
points_per_crop,
crop_n_points_downscale_factor,
+ input_data_format,
)
if return_tensors == "pt":
if device is None:
@@ -906,6 +910,7 @@ def _generate_crop_boxes(
overlap_ratio: float = 512 / 1500,
points_per_crop: Optional[int] = 32,
crop_n_points_downscale_factor: Optional[List[int]] = 1,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[List[List[int]], List[int]]:
"""
Generates a list of crop boxes of different sizes. Each layer has (2**i)**2 boxes for the ith layer.
@@ -925,12 +930,14 @@
Number of points to sample per crop.
crop_n_points_downscale_factor (`int`, *optional*):
The number of points-per-side sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+ input_data_format (`str` or `ChannelDimension`, *optional*):
+     The channel dimension format of the input image. If not provided, it will be inferred.
"""

if isinstance(image, list):
raise ValueError("Only one image is allowed for crop generation.")
image = to_numpy_array(image)
- original_size = get_image_size(image)
+ original_size = get_image_size(image, input_data_format)

points_grid = []
for i in range(crop_n_layers + 1):
@@ -940,7 +947,7 @@ def _generate_crop_boxes(
crop_boxes, layer_idxs = _generate_per_layer_crops(crop_n_layers, overlap_ratio, original_size)

cropped_images, point_grid_per_crop = _generate_crop_images(
- crop_boxes, image, points_grid, layer_idxs, target_size, original_size
+ crop_boxes, image, points_grid, layer_idxs, target_size, original_size, input_data_format
)
crop_boxes = np.array(crop_boxes)
crop_boxes = crop_boxes.astype(np.float32)
@@ -986,7 +993,9 @@ def _generate_per_layer_crops(crop_n_layers, overlap_ratio, original_size):
return crop_boxes, layer_idxs


- def _generate_crop_images(crop_boxes, image, points_grid, layer_idxs, target_size, original_size):
+ def _generate_crop_images(
+     crop_boxes, image, points_grid, layer_idxs, target_size, original_size, input_data_format=None
+ ):
"""
Takes as an input bounding boxes that are used to crop the image. Based in the crops, the corresponding points are
also passed.
@@ -996,15 +1005,15 @@ def _generate_crop_images(crop_boxes, image, points_grid, layer_idxs, target_siz
for i, crop_box in enumerate(crop_boxes):
left, top, right, bottom = crop_box

- channel_dim = infer_channel_dimension_format(image)
+ channel_dim = infer_channel_dimension_format(image, input_data_format)
if channel_dim == ChannelDimension.LAST:
cropped_im = image[top:bottom, left:right, :]
else:
cropped_im = image[:, top:bottom, left:right]

cropped_images.append(cropped_im)

- cropped_im_size = get_image_size(cropped_im)
+ cropped_im_size = get_image_size(cropped_im, channel_dim)
points_scale = np.array(cropped_im_size)[None, ::-1]

points = points_grid[layer_idxs[i]] * points_scale
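`_generate_crop_images` slices each crop box out of the array differently depending on where the channel axis sits, which is why the resolved `channel_dim` is now also passed to `get_image_size`; a small sketch of that branch:

```python
import numpy as np


def crop_image(image: np.ndarray, crop_box, channel_dim: str) -> np.ndarray:
    """Crop a [left, top, right, bottom] box from an HWC or CHW array."""
    left, top, right, bottom = crop_box
    if channel_dim == "channels_last":
        return image[top:bottom, left:right, :]
    return image[:, top:bottom, left:right]


chw = np.zeros((3, 100, 200))
print(crop_image(chw, (10, 20, 110, 70), "channels_first").shape)  # (3, 50, 100)
```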
@@ -93,7 +93,7 @@ def pad(
Returns:
`np.ndarray`: The padded image.
"""
- old_height, old_width = get_image_size(image)
+ old_height, old_width = get_image_size(image, input_data_format)
pad_height = (old_height // size + 1) * size - old_height
pad_width = (old_width // size + 1) * size - old_width

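The pad amounts computed just after the changed line round each side up to the next multiple of `size`; note that with this formula a side that is already an exact multiple still gains one full extra block. A quick numeric check:

```python
size = 8
for old_height in (45, 48):
    pad_height = (old_height // size + 1) * size - old_height
    print(old_height, "->", old_height + pad_height)
# 45 -> 48
# 48 -> 56  (an exact multiple is still padded by a full `size`)
```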