Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DAR-2639][External] Add data & tests for in-platform model training #872

Merged
merged 10 commits into from
Jun 20, 2024
9 changes: 7 additions & 2 deletions darwin/dataset/download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ def _download_image_from_json_annotation(
parent_path,
annotation_path,
video_frames,
use_folders,
)
if force_slots:
return _download_all_slots_from_json_annotation(
Expand All @@ -235,6 +236,7 @@ def _download_image_from_json_annotation(
parent_path,
annotation_path,
video_frames,
use_folders,
)

return []
Expand Down Expand Up @@ -302,12 +304,15 @@ def _download_single_slot_from_json_annotation(
parent_path: Path,
annotation_path: Path,
video_frames: bool,
use_folders: bool = True,
) -> Iterable[Callable[[], None]]:
slot = annotation.slots[0]
generator = []

if video_frames and slot.type != "image":
video_path: Path = parent_path / annotation_path.stem
video_path: Path = parent_path / (
annotation_path.stem if not use_folders else Path(annotation.filename).stem
)
video_path.mkdir(exist_ok=True, parents=True)

# Indicates it's a long video and uses the segment and manifest
Expand Down Expand Up @@ -339,7 +344,7 @@ def _download_single_slot_from_json_annotation(
image_url = image["url"]
image_filename = image["file_name"]
suffix = Path(image_filename).suffix
stem = annotation_path.stem
stem = Path(annotation.filename).stem
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In cases where there are duplicate item names, the annotation_path.stem will have an _n appended, which will cause darwin-py to fail to find the corresponding item. Therefore, use the item name since this will not have a _n appended for uniqueness

filename = str(Path(stem + suffix))
image_path = parent_path / sanitize_filename(
filename or annotation.filename
Expand Down
94 changes: 48 additions & 46 deletions darwin/dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,11 +400,11 @@ def create_bbox_object(obj, box_mode, classes=None):
def get_annotations(
dataset_path: PathLike,
partition: Optional[str] = None,
split_type: Optional[str] = "random",
annotation_format: str = "coco",
split: Optional[str] = "default",
split_type: Optional[str] = None,
annotation_type: str = "polygon",
release_name: Optional[str] = None,
annotation_format: str = "coco",
ignore_inconsistent_examples: bool = False,
) -> Iterator[Dict[str, Any]]:
"""
Expand All @@ -415,17 +415,17 @@ def get_annotations(
dataset_path : PathLike
Path to the location of the dataset on the file system.
partition : Optional[str], default: None
Selects one of the partitions ``[train, val, test]``.
Selects one of the partitions ``[train, val, test, None]``. If not specified, all annotations are returned.
split_type : Optional[str], default: "random"
Heuristic used to do the split ``[random, stratified]``. If not specified, random is used.
annotation_format : str
Re-formatting of the annotation when loaded ``[coco, darwin]``..
split : Optional[str], default: "default"
Selects the split that defines the percentages used (use 'default' to select the default split).
split_type : Optional[str], default: None
Heuristic used to do the split ``[random, stratified, None]``.
annotation_type : str, default: "polygon"
The type of annotation classes ``[tag, bounding_box, polygon]``.
release_name : Optional[str], default: None
Version of the dataset.
annotation_format : Optional[str], default: "coco"
Re-formatting of the annotation when loaded ``[coco, darwin]``.
ignore_inconsistent_examples : bool, default: False
Ignore examples for which we have annotations, but either images are missing,
or more than one images exist for the same annotation.
Expand Down Expand Up @@ -467,18 +467,18 @@ def get_annotations(
)

if partition:
stems = _get_stems_from_split(
release_path, split, split_type, annotation_type, partition
annotation_filepaths = _get_annotation_filepaths_from_split(
release_path, annotation_type, partition, split_type, split=split
)
else:
stems = get_annotation_files_from_dir(annotations_dir)
annotation_filepaths = get_annotation_files_from_dir(annotations_dir)

(
images_paths,
annotations_paths,
invalid_annotation_paths,
) = _map_annotations_to_images(
stems, annotations_dir, images_dir, ignore_inconsistent_examples
annotation_filepaths, images_dir, ignore_inconsistent_examples
)

print(f"Found {len(invalid_annotation_paths)} invalid annotations")
Expand All @@ -505,55 +505,57 @@ def _validate_inputs(
Validates the input parameters for partition, split_type, and annotation_type.

Args:
partition (str, None): Dataset partition. Should be 'train', 'val', 'test' or None.
split_type (str, None): Type of dataset split. Can be 'random', 'stratified' or None.
partition (str, None): Dataset partition. Should be 'train', 'val', 'test', or None.
split_type (str, None): Type of dataset split. Can be 'random' or 'stratified'.
annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'.

Raises:
ValueError: If the input parameters do not match the expected values.
"""
if partition not in ["train", "val", "test", None]:
raise ValueError("partition should be either 'train', 'val', 'test', or None")
if split_type not in ["random", "stratified", None]:
raise ValueError("split_type should be either 'random', 'stratified', or None")
raise ValueError("partition should be either 'train', 'val', 'test', or 'None'")
if split_type not in ["random", "stratified"]:
raise ValueError("split_type should be either 'random', or 'stratified'")
if annotation_type not in ["tag", "polygon", "bounding_box"]:
raise ValueError(
"annotation_type should be either 'tag', 'bounding_box', or 'polygon'"
)


def _get_stems_from_split(
def _get_annotation_filepaths_from_split(
release_path: Path,
split: str,
split_type: Union[str, None],
annotation_type: str,
partition: Union[str, None],
) -> Generator:
partition: str,
split_type: str,
split: Optional[str] = "default",
) -> Generator[str, None, None]:
"""
Determines the file stems based on the dataset split and other parameters.
    Determines the annotation filepaths based on the dataset split and other parameters.

Args:
release_path (Path): Path to the dataset release.
split (str): Dataset split identifier.
split_type (str, None): Type of dataset split. Can be 'random', 'stratified' or None.
annotation_type (str): Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'.
partition (str, None): Dataset partition. Should be 'train', 'val', 'test' or None.
release_path : Path
Path to the dataset release.
annotation_type : str
Type of annotations. Can be 'tag', 'polygon', or 'bounding_box'.
partition : str
Dataset partition. Should be 'train', 'val', 'test'.
split_type : str
Type of dataset split. Can be 'random' or 'stratified'.
split : Optional[str]
Dataset split identifier.

Returns:
Generator[str]: File stems for the dataset.
        Generator[str, None, None]
Filepaths for the dataset.

Raises:
ValueError: If the split_type is invalid.
FileNotFoundError: If the dataset partition file is not found.
"""
if split_type is None:
split_file = f"{partition}.txt"
elif split_type == "random":
if split_type == "random":
dorfmanrobert marked this conversation as resolved.
Show resolved Hide resolved
split_file = f"{split_type}_{partition}.txt"
elif split_type == "stratified":
split_file = f"{split_type}_{annotation_type}_{partition}.txt"
else:
raise ValueError(f"Invalid split_type ({split_type})")

split_path: Path = release_path / "lists" / str(split) / split_file

Expand All @@ -567,16 +569,15 @@ def _get_stems_from_split(


def _map_annotations_to_images(
stems: List[str],
annotations_dir: Path,
annotation_filepaths: Generator[str, None, None],
images_dir: Path,
ignore_inconsistent_examples: bool,
) -> Tuple[List[Path], List[Path], List[Path]]:
"""
Maps annotations to their corresponding images based on the file stems.

Args:
stems (List[str]): List of file stems.
        annotation_filepaths (Generator[str, None, None]): Generator yielding annotation filepaths.
annotations_dir (Path): Directory containing annotation files.
images_dir (Path): Directory containing image files.
ignore_inconsistent_examples (bool): Flag to determine if inconsistent examples should be ignored.
Expand All @@ -591,14 +592,14 @@ def _map_annotations_to_images(
annotations_paths = []
invalid_annotation_paths = []
with_folders = any(item.is_dir() for item in images_dir.iterdir())
for annotation_path in get_annotation_files_from_dir(annotations_dir):
for annotation_path in annotation_filepaths:
darwin_json = stream_darwin_json(Path(annotation_path))
image_path = get_image_path_from_stream(
darwin_json, images_dir, Path(annotation_path), with_folders
)
if image_path.exists():
images_paths.append(image_path)
annotations_paths.append(annotation_path)
annotations_paths.append(Path(annotation_path))
continue
else:
if ignore_inconsistent_examples:
Expand All @@ -618,7 +619,7 @@ def _load_and_format_annotations(
annotation_format: str,
annotation_type: str,
classes: List[str],
) -> Generator:
) -> Generator[str, None, None]:
"""
Loads and formats annotations based on the specified format and type.

Expand Down Expand Up @@ -654,7 +655,7 @@ def _load_and_format_annotations(
)
elif annotation_format == "darwin":
for annotation_path in annotations_paths:
record = attempt_decode(annotation_path)
record = attempt_decode(Path(annotation_path))
yield record


Expand Down Expand Up @@ -795,16 +796,17 @@ def compute_distributions(
)
if not split_file.exists():
split_file = split_path / f"random_{partition}.txt"
stems: List[str] = [e.rstrip("\n\r") for e in split_file.open()]

for stem in stems:
if not stem.endswith(".json"):
stem = f"{stem}.json"
annotation_path: Path = annotations_dir / stem
annotation_filepaths: List[str] = [
e.rstrip("\n\r") for e in split_file.open()
]
for annotation_filepath in annotation_filepaths:
if not annotation_filepath.endswith(".json"):
annotation_filepath = f"{annotation_filepath}.json"
annotation_path: Path = annotations_dir / annotation_filepath
annotation_file: Optional[dt.AnnotationFile] = parse_path(
annotation_path
)

if annotation_file is None:
continue

Expand Down
29 changes: 29 additions & 0 deletions darwin/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1300,6 +1300,35 @@ def convert_polygons_to_sequences(
return sequences


def convert_xyxy_to_bounding_box(box: List[Union[int, float]]) -> dt.BoundingBox:
    """
    Converts a list of xy coordinates representing a bounding box into a dictionary.
    This is used by in-platform model training.

    Parameters
    ----------
    box : List[Union[int, float]]
        List of coordinates in the format ``[x1, y1, x2, y2]``.

    Returns
    -------
    BoundingBox
        Bounding box in the format ``{x: x1, y: y1, w: width, h: height}``.

    Raises
    ------
    ValueError
        If ``box`` does not contain exactly four numeric (int or float) values.
    """
    # Validate the whole input up front, not just box[0]: previously an empty
    # list raised IndexError, a wrong-length list raised an unpacking
    # ValueError with a confusing message, and a non-numeric later element
    # (e.g. [1, 2, "3", 4]) raised TypeError at the subtraction below instead
    # of the documented ValueError.
    if len(box) != 4 or not all(isinstance(coord, (int, float)) for coord in box):
        raise ValueError("Unknown input format")

    x1, y1, x2, y2 = box
    width = x2 - x1
    height = y2 - y1
    return {"x": x1, "y": y1, "w": width, "h": height}


def convert_polygons_to_mask(
polygons: List, height: int, width: int, value: Optional[int] = 1
) -> np.ndarray:
Expand Down
Loading
Loading