diff --git a/darwin/importer/formats/pascalvoc.py b/darwin/importer/formats/pascalvoc.py index dc70411b8..71ed4e92a 100644 --- a/darwin/importer/formats/pascalvoc.py +++ b/darwin/importer/formats/pascalvoc.py @@ -1,28 +1,49 @@ import xml.etree.ElementTree as ET from pathlib import Path -from typing import Optional +from typing import List, NoReturn, Union import darwin.datatypes as dt -def parse_file(path: Path) -> Optional[dt.AnnotationFile]: +def parse_file(path: Path) -> Union[dt.AnnotationFile, None, NoReturn]: if path.suffix != ".xml": - return + return None - tree = ET.parse(path) + tree = ET.parse(str(path)) root = tree.getroot() - filename = root.find("filename").text - annotations = list(filter(None, map(_parse_annotation, root.findall("object")))) + + filename = _find_text_value(root, "filename") + + annotations: List[dt.Annotation] = list(filter(None, map(_parse_annotation, root.findall("object")))) annotation_classes = set([annotation.annotation_class for annotation in annotations]) - return dt.AnnotationFile(path, filename, annotation_classes, annotations, remote_path = "/") + return dt.AnnotationFile(path, filename, annotation_classes, annotations, remote_path="/") + + +# Private +def _parse_annotation(annotation_object: ET.Element) -> Union[dt.Annotation, NoReturn]: + class_name = _find_text_value(annotation_object, "name") -def _parse_annotation(annotation_object): - class_name = annotation_object.find("name").text - bndbox = annotation_object.find("bndbox") - xmin = int(bndbox.find("xmin").text) - xmax = int(bndbox.find("xmax").text) - ymin = int(bndbox.find("ymin").text) - ymax = int(bndbox.find("ymax").text) + bndbox = _find_element(annotation_object, "bndbox") + xmin = int(float(_find_text_value(bndbox, "xmin"))) + xmax = int(float(_find_text_value(bndbox, "xmax"))) + ymin = int(float(_find_text_value(bndbox, "ymin"))) + ymax = int(float(_find_text_value(bndbox, "ymax"))) return dt.make_bounding_box(class_name, xmin, ymin, xmax - xmin, ymax - ymin) + + +# Private +def _find_element(source: ET.Element, name: str) -> Union[ET.Element, NoReturn]: + element = source.find(name) + if element is None: + raise ValueError(f"Could not find {name} element in annotation file") + return element + + +# Private +def _find_text_value(source: ET.Element, name: str) -> Union[str, NoReturn]: + element = _find_element(source, name) + if element is None or element.text is None: + raise ValueError(f"{name} element does not have a text value") + return element.text diff --git a/darwin/importer/importer.py b/darwin/importer/importer.py index cedbe0ad7..63ba5c5f0 100644 --- a/darwin/importer/importer.py +++ b/darwin/importer/importer.py @@ -1,5 +1,9 @@ from pathlib import Path -from typing import Callable, List, Union +from typing import TYPE_CHECKING, Callable, List, Tuple, Union + +if TYPE_CHECKING: + from darwin.client import Client + from darwin.dataset import RemoteDataset import darwin.datatypes as dt from darwin.utils import secure_continue_request @@ -21,7 +25,7 @@ def build_main_annotations_lookup_table(annotation_classes): def find_and_parse( importer: Callable[[Path], Union[List[dt.AnnotationFile], dt.AnnotationFile, None]], file_paths: List[Union[str, Path]], -) -> (List[dt.AnnotationFile], List[dt.AnnotationFile]): +) -> Tuple[List[dt.AnnotationFile], List[dt.AnnotationFile]]: # TODO: this could be done in parallel for file_path in map(Path, file_paths): files = file_path.glob("**/*") if file_path.is_dir() else [file_path] diff --git a/tests/darwin/importer/formats/pascalvoc_test.py b/tests/darwin/importer/formats/pascalvoc_test.py new file mode 100644 index 000000000..64ce6e591 --- /dev/null +++ b/tests/darwin/importer/formats/pascalvoc_test.py @@ -0,0 +1,143 @@ +import xml.etree.ElementTree as ET +from pathlib import Path + +import pytest +from darwin.importer.formats.pascalvoc import parse_file + + +def describe_parse_file(): + @pytest.fixture + def annotation_path(tmp_path: Path): + path = tmp_path / "annotation.xml" + yield path + path.unlink() + + def it_returns_none_if_path_suffix_is_not_xml(): + path = Path("path/to/file.json") + assert parse_file(path) is None + + def it_raises_file_not_found_error_if_file_does_not_exist(): + path = Path("path/to/file.xml") + + with pytest.raises(FileNotFoundError): + parse_file(path) + + def it_raises_value_error_if_filename_tag_not_found(annotation_path: Path): + annotation_path.write_text("") + + with pytest.raises(ValueError) as info: + parse_file(annotation_path) + + assert str(info.value) == "Could not find filename element in annotation file" + + def it_returns_annotation_file_with_empty_annotations_otherwise(annotation_path: Path): + annotation_path.write_text("image.jpg") + + annotation_file = parse_file(annotation_path) + + assert annotation_file is not None + assert annotation_file.path == annotation_path + assert annotation_file.filename == "image.jpg" + assert not annotation_file.annotation_classes + assert not annotation_file.annotations + assert annotation_file.remote_path == "/" + + def it_raises_if_name_tag_not_found_in_object(annotation_path: Path): + annotation_path.write_text("image.jpg") + + with pytest.raises(ValueError) as info: + parse_file(annotation_path) + + assert str(info.value) == "Could not find name element in annotation file" + + def it_raises_if_bndbox_tag_not_found_in_object(annotation_path: Path): + annotation_path.write_text("image.jpgClass") + + with pytest.raises(ValueError) as info: + parse_file(annotation_path) + + assert str(info.value) == "Could not find bndbox element in annotation file" + + def it_raises_if_xmin_tag_not_found_in_object(annotation_path: Path): + annotation_path.write_text( + "image.jpgClass" + ) + + with pytest.raises(ValueError) as info: + parse_file(annotation_path) + + assert str(info.value) == "Could not find xmin element in annotation file" + + def it_raises_if_xmax_tag_not_found_in_object(annotation_path: Path): + annotation_path.write_text( + "image.jpgClass10" + ) + + with pytest.raises(ValueError) as info: + parse_file(annotation_path) + + assert str(info.value) == "Could not find xmax element in annotation file" + + def it_raises_if_ymin_tag_not_found_in_object(annotation_path: Path): + annotation_path.write_text( + "image.jpgClass1010" + ) + + with pytest.raises(ValueError) as info: + parse_file(annotation_path) + + assert str(info.value) == "Could not find ymin element in annotation file" + + def it_raises_if_ymax_tag_not_found_in_object(annotation_path: Path): + annotation_path.write_text( + "image.jpgClass101010" + ) + + with pytest.raises(ValueError) as info: + parse_file(annotation_path) + + assert str(info.value) == "Could not find ymax element in annotation file" + + def it_returns_annotation_file_with_correct_annotations_otherwise(annotation_path: Path): + annotation_path.write_text( + "image.jpgClass10101010" + ) + + annotation_file = parse_file(annotation_path) + + assert annotation_file is not None + assert annotation_file.path == annotation_path + assert annotation_file.filename == "image.jpg" + + class_ = annotation_file.annotation_classes.pop() + assert class_.name == "Class" + assert class_.annotation_type == "bounding_box" + + annotation = annotation_file.annotations.pop() + assert annotation.annotation_class == class_ + assert annotation.data == {"x": 10, "y": 10, "w": 0, "h": 0} + assert annotation.subs == [] + + assert annotation_file.remote_path == "/" + + def it_returns_annotation_file_with_correct_annotations_with_float_values(annotation_path: Path): + annotation_path.write_text( + "image.jpgClass10.010.010.010.0" + ) + + annotation_file = parse_file(annotation_path) + + assert annotation_file is not None + assert annotation_file.path == annotation_path + assert annotation_file.filename == "image.jpg" + + class_ = annotation_file.annotation_classes.pop() + assert class_.name == "Class" + assert class_.annotation_type == "bounding_box" + + annotation = annotation_file.annotations.pop() + assert annotation.annotation_class == class_ + assert annotation.data == {"x": 10, "y": 10, "w": 0, "h": 0} + assert annotation.subs == [] + + assert annotation_file.remote_path == "/"