Use subset name on task export (#8176)

### Motivation and context

Fixes #6451

### How has this been tested?

### Checklist

- [ ] I submit my changes into the `develop` branch
- [ ] I have created a changelog fragment
- [ ] I have updated the documentation accordingly
- [ ] I have added tests to cover my changes
- [ ] I have linked related issues (see [GitHub docs](https://help.github.com/en/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword))
- [ ] I have increased versions of npm packages if it is necessary ([cvat-canvas](https://github.com/cvat-ai/cvat/tree/develop/cvat-canvas#versioning), [cvat-core](https://github.com/cvat-ai/cvat/tree/develop/cvat-core#versioning), [cvat-data](https://github.com/cvat-ai/cvat/tree/develop/cvat-data#versioning) and [cvat-ui](https://github.com/cvat-ai/cvat/tree/develop/cvat-ui#versioning))

### License

- [ ] I submit _my code changes_ under the same [MIT License](https://github.com/cvat-ai/cvat/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern.

## Summary by CodeRabbit

- **New Features**
  - Enhanced task export functionality: Images are now organized into folders based on subsets for better data organization.
- **Tests**
  - Introduced new test methods and updated existing tests to validate the new subset-based export functionality.
  - Added new data entries for improved test coverage and verification.

---------

Co-authored-by: Maxim Zhiltsov <zhiltsov.max35@gmail.com>
cvat-ai · Jul 17, 2024 · 2490153 · 2490153
1 parent 49c39ef
commit 2490153
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 5 deletions.
diff --git a/changelog.d/20240716_160243_dlavrukhin_apply_subset_name_on_task_export.md b/changelog.d/20240716_160243_dlavrukhin_apply_subset_name_on_task_export.md
@@ -0,0 +1,4 @@
+### Changed
+
+- On task export, images are now put into folders depending on the subset
+  (<https://github.com/cvat-ai/cvat/pull/8176>)
diff --git a/cvat/apps/dataset_manager/bindings.py b/cvat/apps/dataset_manager/bindings.py
@@ -200,7 +200,7 @@ class CommonData(InstanceLabelData):
     Tag = namedtuple('Tag', 'frame, label, attributes, source, group, id')
     Tag.__new__.__defaults__ = (0, None)
     Frame = namedtuple(
-        'Frame', 'idx, id, frame, name, width, height, labeled_shapes, tags, shapes, labels')
+        'Frame', 'idx, id, frame, name, width, height, labeled_shapes, tags, shapes, labels, subset')
     Label = namedtuple('Label', 'id, name, color, type')
 
     def __init__(self,
@@ -223,6 +223,7 @@ def __init__(self,
         self._db_data = db_task.data
         self._use_server_track_ids = use_server_track_ids
         self._required_frames = included_frames
+        self._db_subset = db_task.subset
 
         super().__init__(db_task)
 
@@ -268,6 +269,7 @@ def _init_frame_info(self):
                     "path": "frame_{:06d}".format(self.abs_frame_id(frame)),
                     "width": self._db_data.video.width,
                     "height": self._db_data.video.height,
+                    "subset": self._db_subset,
                 } for frame in self.rel_range
             }
         else:
@@ -278,6 +280,7 @@ def _init_frame_info(self):
                     "path": db_image.path,
                     "width": db_image.width,
                     "height": db_image.height,
+                    "subset": self._db_subset,
                 } for db_image in queryset
             }
 
@@ -409,6 +412,7 @@ def get_frame(idx):
                 frames[frame] = CommonData.Frame(
                     idx=idx,
                     id=frame_info.get("id", 0),
+                    subset=frame_info["subset"],
                     frame=frame,
                     name=frame_info["path"],
                     height=frame_info["height"],
@@ -1487,12 +1491,14 @@ def __init__(
         dimension: DimensionType = DimensionType.DIM_2D,
         **kwargs
     ):
+        instance_meta = instance_data.meta[instance_data.META_FIELD]
         dm.SourceExtractor.__init__(
-            self, media_type=dm.Image if dimension == DimensionType.DIM_2D else PointCloud
+            self,
+            media_type=dm.Image if dimension == DimensionType.DIM_2D else PointCloud,
+            subset=instance_meta['subset'],
         )
         CVATDataExtractorMixin.__init__(self, **kwargs)
 
-        instance_meta = instance_data.meta[instance_data.META_FIELD]
         self._categories = self._load_categories(instance_meta['labels'])
         self._user = self._load_user_info(instance_meta) if dimension == DimensionType.DIM_3D else {}
         self._dimension = dimension
@@ -1527,6 +1533,7 @@ def __init__(
                 dm_item = dm.DatasetItem(
                         id=osp.splitext(frame_data.name)[0],
                         annotations=dm_anno, media=dm_image,
+                        subset=frame_data.subset,
                         attributes={'frame': frame_data.frame
                     })
             elif dimension == DimensionType.DIM_3D:
@@ -1543,7 +1550,7 @@ def __init__(
                 dm_item = dm.DatasetItem(
                     id=osp.splitext(osp.split(frame_data.name)[-1])[0],
                     annotations=dm_anno, media=PointCloud(dm_image[0]), related_images=dm_image[1],
-                    attributes=attributes
+                    attributes=attributes, subset=frame_data.subset,
                 )
 
             dm_items.append(dm_item)

diff --git a/cvat/apps/quality_control/quality_reports.py b/cvat/apps/quality_control/quality_reports.py
@@ -1701,7 +1701,7 @@ def _find_gt_conflicts(self):
         gt_job_dataset = self._gt_dataset
 
         for gt_item in gt_job_dataset:
-            ds_item = ds_job_dataset.get(gt_item.id)
+            ds_item = ds_job_dataset.get(id=gt_item.id, subset=gt_item.subset)
             if not ds_item:
                 continue  # we need to compare only intersecting frames
 

diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: MIT
 
 import io
+import itertools
 import json
 import os
 import os.path as osp
@@ -13,6 +14,7 @@
 from http import HTTPStatus
 from itertools import chain, product
 from math import ceil
+from operator import itemgetter
 from pathlib import Path
 from tempfile import NamedTemporaryFile, TemporaryDirectory
 from time import sleep, time
@@ -807,6 +809,32 @@ def test_export_dataset_after_deleting_related_cloud_storage(self, admin_user, t
             response = export_dataset(api_client.tasks_api.retrieve_dataset_endpoint, id=task["id"])
             assert response.data
 
+    @pytest.mark.parametrize(
+        "export_format, default_subset_name, subset_path_template",
+        [
+            ("Datumaro 1.0", "", "images/{subset}"),
+            ("YOLO 1.1", "train", "obj_{subset}_data"),
+        ],
+    )
+    def test_uses_subset_name(
+        self, tasks, admin_user, export_format, default_subset_name, subset_path_template
+    ):
+        group_key_func = itemgetter("subset")
+        subsets_and_tasks = [
+            (subset, next(group))
+            for subset, group in itertools.groupby(
+                sorted(tasks, key=group_key_func),
+                key=group_key_func,
+            )
+        ]
+        for subset_name, task in subsets_and_tasks:
+            response = self._test_export_task(admin_user, tid=task["id"], format=export_format)
+            with zipfile.ZipFile(io.BytesIO(response.data)) as zip_file:
+                subset_path = subset_path_template.format(subset=subset_name or default_subset_name)
+                assert any(
+                    subset_path in path for path in zip_file.namelist()
+                ), f"No {subset_path} in {zip_file.namelist()}"
+
 
 @pytest.mark.usefixtures("restore_db_per_function")
 @pytest.mark.usefixtures("restore_cvat_data")