diff --git a/darwin/importer/formats/pascalvoc.py b/darwin/importer/formats/pascalvoc.py
index dc70411b8..71ed4e92a 100644
--- a/darwin/importer/formats/pascalvoc.py
+++ b/darwin/importer/formats/pascalvoc.py
@@ -1,28 +1,49 @@
import xml.etree.ElementTree as ET
from pathlib import Path
-from typing import Optional
+from typing import List, NoReturn, Union
import darwin.datatypes as dt
-def parse_file(path: Path) -> Optional[dt.AnnotationFile]:
+def parse_file(path: Path) -> Union[dt.AnnotationFile, None, NoReturn]:
+ """
+ Parses a Pascal VOC XML annotation file into a ``dt.AnnotationFile``.
+
+ Returns ``None`` without touching the file when ``path`` does not end in ``.xml``.
+ Otherwise the file is parsed, the text of its ``filename`` element is read, and
+ every ``object`` element is converted through ``_parse_annotation``.
+
+ Raises ``ValueError`` when a required element is missing or has no text, and
+ ``FileNotFoundError`` when the ``.xml`` path does not exist.
+
+ NOTE(review): ``NoReturn`` inside ``Union`` adds no information for type checkers;
+ ``Union[dt.AnnotationFile, None]`` would describe the same contract - confirm and simplify.
+ """
if path.suffix != ".xml":
- return
+ return None
- tree = ET.parse(path)
+ tree = ET.parse(str(path))
root = tree.getroot()
- filename = root.find("filename").text
- annotations = list(filter(None, map(_parse_annotation, root.findall("object"))))
+
+ filename = _find_text_value(root, "filename")
+
+ # One annotation per matched ``object`` element; filter(None, ...) drops any falsy result.
+ annotations: List[dt.Annotation] = list(filter(None, map(_parse_annotation, root.findall("object"))))
annotation_classes = set([annotation.annotation_class for annotation in annotations])
- return dt.AnnotationFile(path, filename, annotation_classes, annotations, remote_path = "/")
+ return dt.AnnotationFile(path, filename, annotation_classes, annotations, remote_path="/")
+
+
+# Private
+def _parse_annotation(annotation_object: ET.Element) -> Union[dt.Annotation, NoReturn]:
+ """
+ Builds a bounding box annotation from a single ``object`` element.
+
+ The class name comes from the ``name`` child; the box corners come from the
+ ``xmin``, ``xmax``, ``ymin`` and ``ymax`` children of ``bndbox``.
+
+ Raises ``ValueError`` (from the lookup helpers) when any of those elements is
+ missing or has no text.
+ """
+ class_name = _find_text_value(annotation_object, "name")
-def _parse_annotation(annotation_object):
- class_name = annotation_object.find("name").text
- bndbox = annotation_object.find("bndbox")
- xmin = int(bndbox.find("xmin").text)
- xmax = int(bndbox.find("xmax").text)
- ymin = int(bndbox.find("ymin").text)
- ymax = int(bndbox.find("ymax").text)
+ bndbox = _find_element(annotation_object, "bndbox")
+ # Going through float() first accepts coordinates written with a decimal point; int() then truncates them.
+ xmin = int(float(_find_text_value(bndbox, "xmin")))
+ xmax = int(float(_find_text_value(bndbox, "xmax")))
+ ymin = int(float(_find_text_value(bndbox, "ymin")))
+ ymax = int(float(_find_text_value(bndbox, "ymax")))
+ # The last two arguments are the corner differences (presumably width and height - confirm against dt.make_bounding_box).
return dt.make_bounding_box(class_name, xmin, ymin, xmax - xmin, ymax - ymin)
+
+
+# Private
+def _find_element(source: ET.Element, name: str) -> ET.Element:
+    """
+    Returns the first sub-element of ``source`` that matches ``name``.
+
+    ``name`` is passed straight to ``Element.find``, so it may be a tag name or a path.
+
+    Raises ``ValueError`` when no matching element exists, so callers never
+    have to deal with ``None``.
+    """
+    element = source.find(name)
+    if element is None:
+        raise ValueError(f"Could not find {name} element in annotation file")
+    return element
+
+
+# Private
+def _find_text_value(source: ET.Element, name: str) -> str:
+    """
+    Returns the text of the first sub-element of ``source`` that matches ``name``.
+
+    Raises ``ValueError`` when the element is missing (raised by ``_find_element``)
+    or when it exists but carries no text.
+    """
+    # _find_element raises instead of returning None, so only the text needs checking here.
+    element = _find_element(source, name)
+    if element.text is None:
+        raise ValueError(f"{name} element does not have a text value")
+    return element.text
diff --git a/darwin/importer/importer.py b/darwin/importer/importer.py
index cedbe0ad7..63ba5c5f0 100644
--- a/darwin/importer/importer.py
+++ b/darwin/importer/importer.py
@@ -1,5 +1,9 @@
from pathlib import Path
-from typing import Callable, List, Union
+from typing import TYPE_CHECKING, Callable, List, Tuple, Union
+
+if TYPE_CHECKING:
+ from darwin.client import Client
+ from darwin.dataset import RemoteDataset
import darwin.datatypes as dt
from darwin.utils import secure_continue_request
@@ -21,7 +25,7 @@ def build_main_annotations_lookup_table(annotation_classes):
def find_and_parse(
importer: Callable[[Path], Union[List[dt.AnnotationFile], dt.AnnotationFile, None]],
file_paths: List[Union[str, Path]],
-) -> (List[dt.AnnotationFile], List[dt.AnnotationFile]):
+) -> Tuple[List[dt.AnnotationFile], List[dt.AnnotationFile]]:
# TODO: this could be done in parallel
for file_path in map(Path, file_paths):
files = file_path.glob("**/*") if file_path.is_dir() else [file_path]
diff --git a/tests/darwin/importer/formats/pascalvoc_test.py b/tests/darwin/importer/formats/pascalvoc_test.py
new file mode 100644
index 000000000..64ce6e591
--- /dev/null
+++ b/tests/darwin/importer/formats/pascalvoc_test.py
@@ -0,0 +1,143 @@
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+import pytest
+from darwin.importer.formats.pascalvoc import parse_file
+
+
+def describe_parse_file():
+ # Test suite for parse_file; each nested ``it_*`` function is one case
+ # (presumably collected through the pytest-describe plugin - confirm).
+ # NOTE(review): several cases below write the same literal payload ("image.jpg") yet expect
+ # different outcomes; presumably the XML markup of these fixtures was lost in this view -
+ # confirm against the real test file before relying on the payloads shown here.
+ @pytest.fixture
+ def annotation_path(tmp_path: Path):
+ # Hands each test a path named annotation.xml under pytest's tmp_path and deletes the file afterwards.
+ # unlink() raises FileNotFoundError if a test never created the file.
+ path = tmp_path / "annotation.xml"
+ yield path
+ path.unlink()
+
+ def it_returns_none_if_path_suffix_is_not_xml():
+ # The suffix check runs before the file is opened, so the path does not need to exist.
+ path = Path("path/to/file.json")
+ assert parse_file(path) is None
+
+ def it_raises_file_not_found_error_if_file_does_not_exist():
+ # The path has an .xml suffix but is never created.
+ path = Path("path/to/file.xml")
+
+ with pytest.raises(FileNotFoundError):
+ parse_file(path)
+
+ def it_raises_value_error_if_filename_tag_not_found(annotation_path: Path):
+ annotation_path.write_text("")
+
+ with pytest.raises(ValueError) as info:
+ parse_file(annotation_path)
+
+ assert str(info.value) == "Could not find filename element in annotation file"
+
+ def it_returns_annotation_file_with_empty_annotations_otherwise(annotation_path: Path):
+ annotation_path.write_text("image.jpg")
+
+ annotation_file = parse_file(annotation_path)
+
+ assert annotation_file is not None
+ assert annotation_file.path == annotation_path
+ assert annotation_file.filename == "image.jpg"
+ assert not annotation_file.annotation_classes
+ assert not annotation_file.annotations
+ assert annotation_file.remote_path == "/"
+
+ def it_raises_if_name_tag_not_found_in_object(annotation_path: Path):
+ annotation_path.write_text("image.jpg")
+
+ with pytest.raises(ValueError) as info:
+ parse_file(annotation_path)
+
+ assert str(info.value) == "Could not find name element in annotation file"
+
+ def it_raises_if_bndbox_tag_not_found_in_object(annotation_path: Path):
+ annotation_path.write_text("image.jpg")
+
+ with pytest.raises(ValueError) as info:
+ parse_file(annotation_path)
+
+ assert str(info.value) == "Could not find bndbox element in annotation file"
+
+ # The next four cases each expect the error for one missing bndbox coordinate element.
+ def it_raises_if_xmin_tag_not_found_in_object(annotation_path: Path):
+ annotation_path.write_text(
+ "image.jpg"
+ )
+
+ with pytest.raises(ValueError) as info:
+ parse_file(annotation_path)
+
+ assert str(info.value) == "Could not find xmin element in annotation file"
+
+ def it_raises_if_xmax_tag_not_found_in_object(annotation_path: Path):
+ annotation_path.write_text(
+ "image.jpg"
+ )
+
+ with pytest.raises(ValueError) as info:
+ parse_file(annotation_path)
+
+ assert str(info.value) == "Could not find xmax element in annotation file"
+
+ def it_raises_if_ymin_tag_not_found_in_object(annotation_path: Path):
+ annotation_path.write_text(
+ "image.jpg"
+ )
+
+ with pytest.raises(ValueError) as info:
+ parse_file(annotation_path)
+
+ assert str(info.value) == "Could not find ymin element in annotation file"
+
+ def it_raises_if_ymax_tag_not_found_in_object(annotation_path: Path):
+ annotation_path.write_text(
+ "image.jpg"
+ )
+
+ with pytest.raises(ValueError) as info:
+ parse_file(annotation_path)
+
+ assert str(info.value) == "Could not find ymax element in annotation file"
+
+ def it_returns_annotation_file_with_correct_annotations_otherwise(annotation_path: Path):
+ annotation_path.write_text(
+ "image.jpg"
+ )
+
+ annotation_file = parse_file(annotation_path)
+
+ assert annotation_file is not None
+ assert annotation_file.path == annotation_path
+ assert annotation_file.filename == "image.jpg"
+
+ # pop() removes the element it returns, so the class set and annotation list are consumed here.
+ class_ = annotation_file.annotation_classes.pop()
+ assert class_.name == "Class"
+ assert class_.annotation_type == "bounding_box"
+
+ annotation = annotation_file.annotations.pop()
+ assert annotation.annotation_class == class_
+ assert annotation.data == {"x": 10, "y": 10, "w": 0, "h": 0}
+ assert annotation.subs == []
+
+ assert annotation_file.remote_path == "/"
+
+ def it_returns_annotation_file_with_correct_annotations_with_float_values(annotation_path: Path):
+ # Same expectations as the previous case; presumably the payload writes the coordinates as floats - confirm.
+ annotation_path.write_text(
+ "image.jpg"
+ )
+
+ annotation_file = parse_file(annotation_path)
+
+ assert annotation_file is not None
+ assert annotation_file.path == annotation_path
+ assert annotation_file.filename == "image.jpg"
+
+ class_ = annotation_file.annotation_classes.pop()
+ assert class_.name == "Class"
+ assert class_.annotation_type == "bounding_box"
+
+ annotation = annotation_file.annotations.pop()
+ assert annotation.annotation_class == class_
+ assert annotation.data == {"x": 10, "y": 10, "w": 0, "h": 0}
+ assert annotation.subs == []
+
+ assert annotation_file.remote_path == "/"