Resolve tests and tidy up
amyeroberts committed Aug 16, 2023
1 parent 59f15de commit 6298095
Showing 28 changed files with 312 additions and 50 deletions.
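Most of the hunks below apply one recurring fix: image-processing helpers now receive the caller-supplied `input_data_format` explicitly (and positionally where the helper takes a positional `channel_dim`), instead of re-inferring the channel layout on every call or passing it under a keyword the helper does not accept. A minimal sketch of the idea, using simplified stand-ins rather than the actual transformers helpers:

```python
from typing import Optional, Tuple

import numpy as np


def infer_channel_dimension_format(image: np.ndarray) -> str:
    """Simplified stand-in: guess whether the channel axis comes first or last."""
    if image.ndim != 3:
        raise ValueError("Expected a 3D image array")
    if image.shape[0] in (1, 3):
        return "channels_first"
    if image.shape[-1] in (1, 3):
        return "channels_last"
    raise ValueError("Could not infer the channel dimension format")


def get_image_size(image: np.ndarray, channel_dim: Optional[str] = None) -> Tuple[int, int]:
    """Simplified stand-in: the second parameter is positional `channel_dim`,
    so a keyword call like `get_image_size(image, input_data_format=...)` fails."""
    if channel_dim is None:
        channel_dim = infer_channel_dimension_format(image)
    if channel_dim == "channels_first":
        return image.shape[1], image.shape[2]
    return image.shape[0], image.shape[1]


image = np.zeros((3, 480, 640), dtype=np.float32)          # channels-first RGB
input_data_format = infer_channel_dimension_format(image)  # infer once
print(get_image_size(image, input_data_format))            # then pass it on: (480, 640)
```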
4 changes: 1 addition & 3 deletions src/transformers/models/beit/image_processing_beit.py
@@ -238,9 +238,7 @@ def _preprocess_image(
# All transformations expect numpy arrays.
image = to_numpy_array(image)
if input_data_format is None:
- input_data_format = infer_channel_dimension_format(
-     image, input_data_format=input_data_format, num_channels=num_channels
- )
+ input_data_format = infer_channel_dimension_format(image, num_channels=num_channels)
image = self._preprocess(
image,
do_reduce_labels=False,
@@ -95,7 +95,7 @@ def get_resize_output_image_size(
size_divisor: int = 32,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
- input_height, input_width = get_image_size(input_image, input_data_format=input_data_format)
+ input_height, input_width = get_image_size(input_image, input_data_format)
min_size, max_size = shorter, longer

scale = min_size / min(input_height, input_width)
@@ -508,7 +508,7 @@ def preprocess(

if do_pad:
encoded_outputs = self.pad(
- images, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=input_data_format
+ images, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=data_format
)
else:
encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
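The hunk above sits in a shortest/longest-edge resize: the short side is scaled to `shorter`, the long side is capped at `longer`, and the result is rounded to a multiple of `size_divisor`. A hedged sketch of that rule (the exact rounding in the real `get_resize_output_image_size` may differ):

```python
def resize_output_size(height: int, width: int, shorter: int = 800, longer: int = 1333, size_divisor: int = 32):
    # Scale the shorter side to `shorter`, but never let the longer side exceed `longer`.
    scale = shorter / min(height, width)
    if max(height, width) * scale > longer:
        scale = longer / max(height, width)
    # Round each side to a multiple of `size_divisor` (rounding strategy assumed).
    new_height = int(round(height * scale / size_divisor) * size_divisor)
    new_width = int(round(width * scale / size_divisor) * size_divisor)
    return new_height, new_width


print(resize_output_size(480, 640))  # (800, 1056) with the defaults above
```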
@@ -144,7 +144,7 @@ def get_resize_output_image_size(
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
- image_size = get_image_size(input_image, input_data_format=input_data_format)
+ image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size

@@ -1291,12 +1291,12 @@ def preprocess(
if annotations is not None:
resized_images, resized_annotations = [], []
for image, target in zip(images, annotations):
- orig_size = get_image_size(image)
+ orig_size = get_image_size(image, input_data_format)
resized_image = self.resize(
image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format
)
resized_annotation = self.resize_annotation(
- target, orig_size, get_image_size(resized_image), input_data_format=input_data_format
+ target, orig_size, get_image_size(resized_image, input_data_format)
)
resized_images.append(resized_image)
resized_annotations.append(resized_annotation)
@@ -1318,7 +1318,9 @@ def preprocess(
]
if annotations is not None:
annotations = [
- self.normalize_annotation(annotation, get_image_size(image), input_data_format=input_data_format)
+ self.normalize_annotation(
+     annotation, get_image_size(image, input_data_format), input_data_format=input_data_format
+ )
for annotation, image in zip(annotations, images)
]

@@ -154,7 +154,14 @@ def resize(
resize_size = get_resize_output_image_size(
image, size=resize_shortest_edge, default_to_square=False, input_data_format=input_data_format
)
- image = resize(image=image, size=resize_size, resample=resample, data_format=data_format, **kwargs)
+ image = resize(
+     image=image,
+     size=resize_size,
+     resample=resample,
+     data_format=data_format,
+     input_data_format=input_data_format,
+     **kwargs,
+ )
# then crop to (shortest_edge, shortest_edge)
return center_crop(
image=image,
@@ -143,7 +143,7 @@ def get_resize_output_image_size(
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
- image_size = get_image_size(input_image, input_data_format=input_data_format)
+ image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size

@@ -1289,12 +1289,12 @@ def preprocess(
if annotations is not None:
resized_images, resized_annotations = [], []
for image, target in zip(images, annotations):
- orig_size = get_image_size(image)
+ orig_size = get_image_size(image, input_data_format)
resized_image = self.resize(
image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format
)
resized_annotation = self.resize_annotation(
- target, orig_size, get_image_size(resized_image), input_data_format=input_data_format
+ target, orig_size, get_image_size(resized_image, input_data_format)
)
resized_images.append(resized_image)
resized_annotations.append(resized_annotation)
@@ -1316,7 +1316,9 @@ def preprocess(
]
if annotations is not None:
annotations = [
- self.normalize_annotation(annotation, get_image_size(image), input_data_format=input_data_format)
+ self.normalize_annotation(
+     annotation, get_image_size(image, input_data_format), input_data_format=input_data_format
+ )
for annotation, image in zip(annotations, images)
]

10 changes: 6 additions & 4 deletions src/transformers/models/deta/image_processing_deta.py
@@ -135,7 +135,7 @@ def get_resize_output_image_size(
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
- image_size = get_image_size(input_image, input_data_format=input_data_format)
+ image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size

@@ -953,11 +953,13 @@ def preprocess(
if annotations is not None:
resized_images, resized_annotations = [], []
for image, target in zip(images, annotations):
- orig_size = get_image_size(image)
+ orig_size = get_image_size(image, input_data_format)
resized_image = self.resize(
image, size=size, resample=resample, input_data_format=input_data_format
)
- resized_annotation = self.resize_annotation(target, orig_size, get_image_size(resized_image))
+ resized_annotation = self.resize_annotation(
+     target, orig_size, get_image_size(resized_image, input_data_format)
+ )
resized_images.append(resized_image)
resized_annotations.append(resized_annotation)
images = resized_images
@@ -978,7 +980,7 @@ def preprocess(
]
if annotations is not None:
annotations = [
- self.normalize_annotation(annotation, get_image_size(image))
+ self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]

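The `normalize_annotation` calls above are why the true `(height, width)` now comes from `get_image_size(image, input_data_format)`: the annotation's absolute pixel boxes are divided by the image size. A hedged sketch of that step for corner-format boxes (the real helper also converts to center format and rescales other annotation keys):

```python
import numpy as np


def normalize_boxes(boxes: np.ndarray, image_size: tuple) -> np.ndarray:
    """Scale absolute [x0, y0, x1, y1] boxes into the [0, 1] range by image size."""
    height, width = image_size
    return boxes / np.array([width, height, width, height], dtype=np.float32)


boxes = np.array([[32.0, 40.0, 96.0, 120.0]])
print(normalize_boxes(boxes, (240, 320)))  # approximately [[0.1  0.1667  0.3  0.5]]
```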
10 changes: 6 additions & 4 deletions src/transformers/models/detr/image_processing_detr.py
@@ -141,7 +141,7 @@ def get_resize_output_image_size(
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
- image_size = get_image_size(input_image, input_data_format=input_data_format)
+ image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size

@@ -1261,12 +1261,12 @@ def preprocess(
if annotations is not None:
resized_images, resized_annotations = [], []
for image, target in zip(images, annotations):
- orig_size = get_image_size(image)
+ orig_size = get_image_size(image, input_data_format)
resized_image = self.resize(
image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format
)
resized_annotation = self.resize_annotation(
- target, orig_size, get_image_size(resized_image), input_data_format=input_data_format
+ target, orig_size, get_image_size(resized_image, input_data_format)
)
resized_images.append(resized_image)
resized_annotations.append(resized_annotation)
@@ -1288,7 +1288,9 @@ def preprocess(
]
if annotations is not None:
annotations = [
- self.normalize_annotation(annotation, get_image_size(image), input_data_format=input_data_format)
+ self.normalize_annotation(
+     annotation, get_image_size(image, input_data_format), input_data_format=input_data_format
+ )
for annotation, image in zip(annotations, images)
]

4 changes: 2 additions & 2 deletions src/transformers/models/dpt/image_processing_dpt.py
@@ -68,7 +68,7 @@ def constraint_to_multiple_of(val, multiple, min_val=0, max_val=None):

output_size = (output_size, output_size) if isinstance(output_size, int) else output_size

- input_height, input_width = get_image_size(input_image, input_data_format=input_data_format)
+ input_height, input_width = get_image_size(input_image, input_data_format)
output_height, output_width = output_size

# determine new height and width
@@ -188,7 +188,7 @@ def resize(
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
- size = get_size_dict(size, input_data_format=input_data_format)
+ size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
output_size = get_resize_output_image_size(
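`constraint_to_multiple_of`, named in the first hunk header above, snaps a computed dimension to a multiple of `multiple` while respecting optional bounds; a hedged sketch consistent with that signature (the exact rounding strategy is an assumption):

```python
import math


def constraint_to_multiple_of(val, multiple, min_val=0, max_val=None):
    # Round to the nearest multiple, backing off to stay within the bounds.
    x = round(val / multiple) * multiple
    if max_val is not None and x > max_val:
        x = math.floor(val / multiple) * multiple
    if x < min_val:
        x = math.ceil(val / multiple) * multiple
    return x


print(constraint_to_multiple_of(250, 32))  # 256
```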
@@ -350,7 +350,7 @@ def preprocess(

if include_top:
images = [
- self.normalize(image=image, mean=[0, 0, 0], std=image_std, input_data_format=input_data_format)
+ self.normalize(image=image, mean=0, std=image_std, input_data_format=input_data_format)
for image in images
]

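Replacing `mean=[0, 0, 0]` with `mean=0` relies on NumPy broadcasting, so the call no longer hard-codes a three-channel image; a quick check of the equivalence:

```python
import numpy as np

image = np.random.rand(224, 224, 3).astype(np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

# A scalar mean of 0 broadcasts over any number of channels,
# whereas [0, 0, 0] only fits three-channel inputs.
a = (image - 0) / std
b = (image - np.array([0.0, 0.0, 0.0], dtype=np.float32)) / std
print(np.allclose(a, b))  # True
```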
@@ -51,12 +51,17 @@ def normalize_box(box, width, height):
]


- def apply_tesseract(image: np.ndarray, lang: Optional[str], tesseract_config: Optional[str] = None):
+ def apply_tesseract(
+     image: np.ndarray,
+     lang: Optional[str],
+     tesseract_config: Optional[str] = None,
+     input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
tesseract_config = tesseract_config if tesseract_config is not None else ""

# apply OCR
- pil_image = to_pil_image(image)
+ pil_image = to_pil_image(image, input_data_format=input_data_format)
image_width, image_height = pil_image.size
data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
@@ -260,7 +265,7 @@ def preprocess(
words_batch = []
boxes_batch = []
for image in images:
- words, boxes = apply_tesseract(image, ocr_lang, tesseract_config)
+ words, boxes = apply_tesseract(image, ocr_lang, tesseract_config, input_data_format=input_data_format)
words_batch.append(words)
boxes_batch.append(boxes)

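`to_pil_image` has to know where the channel axis sits before handing the array to PIL, which is what the new `input_data_format` argument conveys; a hedged stand-alone sketch of that conversion (simplified, uint8 data assumed, not the transformers implementation):

```python
import numpy as np
from PIL import Image


def to_pil_image(image: np.ndarray, input_data_format: str = "channels_last") -> Image.Image:
    """Move a channels-first array to height-width-channels before building the PIL image."""
    if input_data_format == "channels_first":
        image = np.transpose(image, (1, 2, 0))
    return Image.fromarray(image.astype(np.uint8))


chw = np.zeros((3, 60, 80), dtype=np.uint8)                  # channels-first input
pil = to_pil_image(chw, input_data_format="channels_first")
print(pil.size)                                              # (80, 60): PIL reports (width, height)
```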
@@ -53,11 +53,16 @@ def normalize_box(box, width, height):
]


- def apply_tesseract(image: np.ndarray, lang: Optional[str], tesseract_config: Optional[str]):
+ def apply_tesseract(
+     image: np.ndarray,
+     lang: Optional[str],
+     tesseract_config: Optional[str],
+     input_data_format: Optional[Union[ChannelDimension, str]] = None,
+ ):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""

# apply OCR
- pil_image = to_pil_image(image)
+ pil_image = to_pil_image(image, input_data_format=input_data_format)
image_width, image_height = pil_image.size
data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
@@ -323,7 +328,7 @@ def preprocess(
words_batch = []
boxes_batch = []
for image in images:
- words, boxes = apply_tesseract(image, ocr_lang, tesseract_config)
+ words, boxes = apply_tesseract(image, ocr_lang, tesseract_config, input_data_format=input_data_format)
words_batch.append(words)
boxes_batch.append(boxes)

@@ -632,6 +632,7 @@ def _preprocess_mask(
do_resize: bool = None,
size: Dict[str, int] = None,
size_divisor: int = 0,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""Preprocesses a single mask."""
segmentation_map = to_numpy_array(segmentation_map)
@@ -920,7 +921,9 @@ def encode_inputs(
if input_data_format is None:
input_data_format = infer_channel_dimension_format(pixel_values_list[0])

- encoded_inputs = self.pad(pixel_values_list, return_tensors=return_tensors)
+ encoded_inputs = self.pad(
+     pixel_values_list, return_tensors=return_tensors, input_data_format=input_data_format
+ )

if segmentation_maps is not None:
mask_labels = []
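The `self.pad` call being updated pads every image in the batch to a common size and returns a pixel mask marking the real pixels, so it too has to know the channel layout; a hedged sketch of that behaviour for channels-first inputs:

```python
import numpy as np


def pad_batch_channels_first(images):
    """Bottom/right-pad a list of CHW images to the batch maximum and build pixel masks."""
    max_h = max(img.shape[1] for img in images)
    max_w = max(img.shape[2] for img in images)
    pixel_values, pixel_mask = [], []
    for img in images:
        c, h, w = img.shape
        padded = np.zeros((c, max_h, max_w), dtype=img.dtype)
        padded[:, :h, :w] = img
        mask = np.zeros((max_h, max_w), dtype=np.int64)
        mask[:h, :w] = 1
        pixel_values.append(padded)
        pixel_mask.append(mask)
    return np.stack(pixel_values), np.stack(pixel_mask)


batch, mask = pad_batch_channels_first([np.ones((3, 48, 64)), np.ones((3, 64, 32))])
print(batch.shape, mask.shape)  # (2, 3, 64, 64) (2, 64, 64)
```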
@@ -269,7 +269,7 @@ def extract_flattened_patches(
image = torch.from_numpy(image)

patch_height, patch_width = patch_size["height"], patch_size["width"]
- image_height, image_width = get_image_size(image, input_data_format=ChannelDimension.FIRST)
+ image_height, image_width = get_image_size(image, ChannelDimension.FIRST)

# maximize scale s.t.
scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
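The `scale` line shown above picks the largest zoom at which the patch grid still fits under `max_patches`; a quick worked example (the row/column clamping that typically follows is included only for illustration):

```python
import math

image_height, image_width = 480, 640
patch_height, patch_width = 16, 16
max_patches = 2048

# maximize scale s.t. (scale * H / ph) * (scale * W / pw) <= max_patches
scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
print(round(scale, 3), rows, cols, rows * cols)  # 1.306 39 52 2028 (<= 2048)
```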
19 changes: 14 additions & 5 deletions src/transformers/models/sam/image_processing_sam.py
@@ -568,6 +568,7 @@ def generate_crop_boxes(
points_per_crop: Optional[int] = 32,
crop_n_points_downscale_factor: Optional[List[int]] = 1,
device: Optional["torch.device"] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
return_tensors: str = "pt",
):
"""
@@ -590,6 +591,8 @@
The number of points-per-side sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
device (`torch.device`, *optional*, defaults to None):
Device to use for the computation. If None, cpu will be used.
+ input_data_format (`str` or `ChannelDimension`, *optional*):
+     The channel dimension format of the input image. If not provided, it will be inferred.
return_tensors (`str`, *optional*, defaults to `pt`):
If `pt`, returns `torch.Tensor`. If `tf`, returns `tf.Tensor`.
"""
@@ -600,6 +603,7 @@ def generate_crop_boxes(
overlap_ratio,
points_per_crop,
crop_n_points_downscale_factor,
+ input_data_format,
)
if return_tensors == "pt":
if device is None:
@@ -906,6 +910,7 @@ def _generate_crop_boxes(
overlap_ratio: float = 512 / 1500,
points_per_crop: Optional[int] = 32,
crop_n_points_downscale_factor: Optional[List[int]] = 1,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[List[List[int]], List[int]]:
"""
Generates a list of crop boxes of different sizes. Each layer has (2**i)**2 boxes for the ith layer.
@@ -925,12 +930,14 @@
Number of points to sample per crop.
crop_n_points_downscale_factor (`int`, *optional*):
The number of points-per-side sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+ input_data_format (`str` or `ChannelDimension`, *optional*):
+     The channel dimension format of the input image. If not provided, it will be inferred.
"""

if isinstance(image, list):
raise ValueError("Only one image is allowed for crop generation.")
image = to_numpy_array(image)
- original_size = get_image_size(image)
+ original_size = get_image_size(image, input_data_format)

points_grid = []
for i in range(crop_n_layers + 1):
@@ -940,7 +947,7 @@ def _generate_crop_boxes(
crop_boxes, layer_idxs = _generate_per_layer_crops(crop_n_layers, overlap_ratio, original_size)

cropped_images, point_grid_per_crop = _generate_crop_images(
- crop_boxes, image, points_grid, layer_idxs, target_size, original_size
+ crop_boxes, image, points_grid, layer_idxs, target_size, original_size, input_data_format
)
crop_boxes = np.array(crop_boxes)
crop_boxes = crop_boxes.astype(np.float32)
@@ -986,7 +993,9 @@ def _generate_per_layer_crops(crop_n_layers, overlap_ratio, original_size):
return crop_boxes, layer_idxs


- def _generate_crop_images(crop_boxes, image, points_grid, layer_idxs, target_size, original_size):
+ def _generate_crop_images(
+     crop_boxes, image, points_grid, layer_idxs, target_size, original_size, input_data_format=None
+ ):
"""
Takes as an input bounding boxes that are used to crop the image. Based in the crops, the corresponding points are
also passed.
@@ -996,15 +1005,15 @@ def _generate_crop_images(crop_boxes, image, points_grid, layer_idxs, target_siz
for i, crop_box in enumerate(crop_boxes):
left, top, right, bottom = crop_box

- channel_dim = infer_channel_dimension_format(image)
+ channel_dim = infer_channel_dimension_format(image, input_data_format)
if channel_dim == ChannelDimension.LAST:
cropped_im = image[top:bottom, left:right, :]
else:
cropped_im = image[:, top:bottom, left:right]

cropped_images.append(cropped_im)

- cropped_im_size = get_image_size(cropped_im)
+ cropped_im_size = get_image_size(cropped_im, channel_dim)
points_scale = np.array(cropped_im_size)[None, ::-1]

points = points_grid[layer_idxs[i]] * points_scale
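`_generate_crop_images` slices each crop box out of the array differently depending on where the channel axis sits, which is why the resolved `channel_dim` is now also passed to `get_image_size`; a small sketch of that branch:

```python
import numpy as np


def crop_image(image: np.ndarray, crop_box, channel_dim: str) -> np.ndarray:
    """Crop a [left, top, right, bottom] box from an HWC or CHW array."""
    left, top, right, bottom = crop_box
    if channel_dim == "channels_last":
        return image[top:bottom, left:right, :]
    return image[:, top:bottom, left:right]


chw = np.zeros((3, 100, 200))
print(crop_image(chw, (10, 20, 110, 70), "channels_first").shape)  # (3, 50, 100)
```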
@@ -93,7 +93,7 @@ def pad(
Returns:
`np.ndarray`: The padded image.
"""
- old_height, old_width = get_image_size(image)
+ old_height, old_width = get_image_size(image, input_data_format)
pad_height = (old_height // size + 1) * size - old_height
pad_width = (old_width // size + 1) * size - old_width

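The pad amounts computed just after the changed line round each side up to the next multiple of `size`; note that with this formula a side that is already an exact multiple still gains one full extra block. A quick numeric check:

```python
size = 8
for old_height in (45, 48):
    pad_height = (old_height // size + 1) * size - old_height
    print(old_height, "->", old_height + pad_height)
# 45 -> 48
# 48 -> 56  (an exact multiple is still padded by a full `size`)
```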