cvat-ai · SpecLad · Jun 8, 2023 · Oct 11, 2022 · Oct 11, 2022 · Oct 12, 2022
@@ -1,2 +1,4 @@
 cvat-sdk/docs/
 cvat-sdk/README.md
+.env/
+site/themes/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## \[2.5.0] - Unreleased
 ### Added
+- \[Server API\] An option to supply custom file ordering for task data uploads (<https://github.com/opencv/cvat/pull/5083>)
+
+### Changed
+- Allowed to use dataset manifest for the `predefined` sorting method for task data (<https://github.com/opencv/cvat/pull/5083>)
 - TBD
 
 ### Changed
@@ -25,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Deletion of uploaded files, including annotations and backups,
   after they have been uploaded to the server using the TUS protocol  but before an RQ job has been initiated. (<https://github.com/opencv/cvat/pull/5909>)
 - Simultaneous creation of tasks or projects with identical names from backups by multiple users.(<https://github.com/opencv/cvat/pull/5909>)
+- \[Server API\] The `predefined` sorting method for task data uploads (<https://github.com/opencv/cvat/pull/5083>)
 
 ### Security
 - TDB

@@ -1,4 +1,4 @@
-# Copyright (C) 2022 CVAT.ai Corporation
+# Copyright (C) 2022-2023 CVAT.ai Corporation
 #
 # SPDX-License-Identifier: MIT
 
@@ -350,6 +350,10 @@ def upload_files(
         if pbar is not None:
             pbar.start(total_size, desc="Uploading data")
 
+        if str(kwargs.get("sorting_method")).lower() == "predefined":
+            # Request file ordering, because we reorder files to send more efficiently
+            kwargs.setdefault("upload_file_order", [p.name for p in resources])
+
         self._tus_start_upload(url)
 
         for group, group_size in bulk_file_groups:
@@ -374,7 +378,6 @@ def upload_files(
                 pbar.advance(group_size)
 
         for filename in separate_files:
-            # TODO: check if basename produces invalid paths here, can lead to overwriting
             self._upload_file_data_with_tus(
                 url,
                 filename,

@@ -381,12 +381,24 @@ def get_zip_filename(self):
 
     def get_path(self, i):
         if self._zip_source.filename:
-            return os.path.join(os.path.dirname(self._zip_source.filename), self._source_path[i]) \
-                if not self.extract_dir else os.path.join(self.extract_dir, self._source_path[i])
+            prefix = self._get_extract_prefix()
+            return os.path.join(prefix, self._source_path[i])
         else: # necessary for mime_type definition
             return self._source_path[i]
 
+    def __contains__(self, media_file):
+        return super().__contains__(os.path.relpath(media_file, self._get_extract_prefix()))
+
+    def _get_extract_prefix(self):
+        return self.extract_dir or os.path.dirname(self._zip_source.filename)
+
     def reconcile(self, source_files, step=1, start=0, stop=None, dimension=DimensionType.DIM_2D, sorting_method=None):
+        if source_files:
+            # file list is expected to be a processed output of self.get_path()
+            # which returns files with the output directory prefix
+            prefix = self._get_extract_prefix()
+            source_files = [os.path.relpath(fn, prefix) for fn in source_files]
+
         super().reconcile(
             source_files=source_files,
             step=step,
@@ -397,7 +409,7 @@ def reconcile(self, source_files, step=1, start=0, stop=None, dimension=Dimensio
         )
 
     def extract(self):
-        self._zip_source.extractall(self.extract_dir if self.extract_dir else os.path.dirname(self._zip_source.filename))
+        self._zip_source.extractall(self._get_extract_prefix())
         if not self.extract_dir:
             os.remove(self._zip_source.filename)
 

@@ -0,0 +1,37 @@
+# Generated by Django 3.2.18 on 2023-04-18 09:01
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('engine', '0067_alter_cloudstorage_credentials_type'),
+    ]
+
+    operations = [
+        migrations.AlterModelOptions(
+            name='clientfile',
+            options={'default_permissions': (), 'ordering': ('id',)},
+        ),
+        migrations.AlterModelOptions(
+            name='relatedfile',
+            options={'default_permissions': (), 'ordering': ('id',)},
+        ),
+        migrations.AlterModelOptions(
+            name='remotefile',
+            options={'default_permissions': (), 'ordering': ('id',)},
+        ),
+        migrations.AlterModelOptions(
+            name='serverfile',
+            options={'default_permissions': (), 'ordering': ('id',)},
+        ),
+        migrations.AlterUniqueTogether(
+            name='remotefile',
+            unique_together={('data', 'file')},
+        ),
+        migrations.AlterUniqueTogether(
+            name='serverfile',
+            unique_together={('data', 'file')},
+        ),
+    ]
@@ -6,6 +6,7 @@
 import base64
 import json
 import os
+import os.path
 import uuid
 from dataclasses import asdict, dataclass
 from distutils.util import strtobool
@@ -151,9 +152,30 @@ def __init__(self, request):
         self.size = int(request.META.get("CONTENT_LENGTH", settings.TUS_DEFAULT_CHUNK_SIZE))
         self.content = request.body
 
-# This upload mixin is implemented using tus
-# tus is open protocol for file uploads (see more https://tus.io/)
 class UploadMixin:
+    """
+    Implements file uploads to the server. Allows to upload single and multiple files, suspend
+    and resume uploading. Uses the TUS open file uploading protocol (https://tus.io/).
+
+    Implements the following protocols:
+    a. A single Data request
+
+    and
+
+    b.1. An Upload-Start request
+    b.2.a. The regular TUS protocol requests (Upload-Length + Chunks)
+    b.2.b. Upload-Multiple requests
+    b.3. An Upload-Finish request
+
+    Requests:
+    - Data - POST, no extra headers or 'Upload-Start' + 'Upload-Finish' headers
+    - Upload-Start - POST, has an 'Upload-Start' header
+    - Upload-Length - POST, has an 'Upload-Length' header (read the TUS protocol)
+    - Chunk - HEAD/PATCH (read the TUS protocol)
+    - Upload-Finish - POST, has an 'Upload-Finish' header
+    - Upload-Multiple - POST, has a 'Upload-Multiple' header
+    """
+
     _tus_api_version = '1.0.0'
     _tus_api_version_supported = ['1.0.0']
     _tus_api_extensions = []
@@ -204,11 +226,11 @@ def upload_data(self, request):
         if one_request_upload or finish_upload:
             return self.upload_finished(request)
         elif start_upload:
-            return Response(status=status.HTTP_202_ACCEPTED)
+            return self.upload_started(request)
         elif tus_request:
             return self.init_tus_upload(request)
         elif bulk_file_upload:
-            return self.append(request)
+            return self.append_files(request)
         else: # backward compatibility case - no upload headers were found
             return self.upload_finished(request)
 
@@ -218,7 +240,7 @@ def init_tus_upload(self, request):
         else:
             metadata = self._get_metadata(request)
             filename = metadata.get('filename', '')
-            if not self.validate_filename(filename):
+            if not self.is_valid_uploaded_file_name(filename):
                 return self._tus_response(status=status.HTTP_400_BAD_REQUEST,
                     data="File name {} is not allowed".format(filename))
 
@@ -310,32 +332,55 @@ def append_tus_chunk(self, request, file_id):
                                     extra_headers={'Upload-Offset': tus_file.offset,
                                                     'Upload-Filename': tus_file.filename})
 
-    def validate_filename(self, filename):
+    def is_valid_uploaded_file_name(self, filename: str) -> bool:
+        """
+        Checks the file name to be valid.
+        Returns True if the filename is valid, otherwise returns False.
+        """
+
         upload_dir = self.get_upload_dir()
         file_path = os.path.join(upload_dir, filename)
         return os.path.commonprefix((os.path.realpath(file_path), upload_dir)) == upload_dir
 
-    def get_upload_dir(self):
+    def get_upload_dir(self) -> str:
         return self._object.data.get_upload_dirname()
 
-    def get_request_client_files(self, request):
+    def _get_request_client_files(self, request):
         serializer = DataSerializer(self._object, data=request.data)
         serializer.is_valid(raise_exception=True)
-        data = {k: v for k, v in serializer.validated_data.items()}
-        return data.get('client_files', None)
+        return serializer.validated_data.get('client_files')
 
-    def append(self, request):
-        client_files = self.get_request_client_files(request)
+    def append_files(self, request):
+        """
+        Processes a single or multiple files sent in a single request inside
+        a file uploading session.
+        """
+
+        client_files = self._get_request_client_files(request)
         if client_files:
             upload_dir = self.get_upload_dir()
             for client_file in client_files:
-                with open(os.path.join(upload_dir, client_file['file'].name), 'ab+') as destination:
+                filename = client_file['file'].name
+                if not self.is_valid_uploaded_file_name(filename):
+                    return Response(status=status.HTTP_400_BAD_REQUEST,
+                        data=f"File name {filename} is not allowed", content_type="text/plain")
+
+                with open(os.path.join(upload_dir, filename), 'ab+') as destination:
                     destination.write(client_file['file'].read())
         return Response(status=status.HTTP_200_OK)
 
-    # override this to do stuff after upload
+    def upload_started(self, request):
+        """
+        Allows to do actions before upcoming file uploading.
+        """
+        return Response(status=status.HTTP_202_ACCEPTED)
+
     def upload_finished(self, request):
-        raise NotImplementedError('You need to implement upload_finished in UploadMixin')
+        """
+        Allows to process uploaded files.
+        """
+
+        raise NotImplementedError('Must be implemented in the derived class')
 
 class AnnotationMixin:
     def export_annotations(self, request, db_obj, export_func, callback, get_data=None):

@@ -264,11 +264,6 @@ def make_dirs(self):
         os.makedirs(self.get_original_cache_dirname())
         os.makedirs(self.get_upload_dirname())
 
-    def get_uploaded_files(self):
-        upload_dir = self.get_upload_dirname()
-        uploaded_files = [os.path.join(upload_dir, file) for file in os.listdir(upload_dir) if os.path.isfile(os.path.join(upload_dir, file))]
-        represented_files = [{'file':f} for f in uploaded_files]
-        return represented_files
 
 class Video(models.Model):
     data = models.OneToOneField(Data, on_delete=models.CASCADE, related_name="video", null=True)
@@ -424,13 +419,22 @@ class Meta:
         default_permissions = ()
         unique_together = ("data", "file")
 
+        # Some DBs can shuffle the rows. Here we restore the insertion order.
+        # https://github.com/opencv/cvat/pull/5083#discussion_r1038032715
+        ordering = ('id', )
+
 # For server files on the mounted share
 class ServerFile(models.Model):
     data = models.ForeignKey(Data, on_delete=models.CASCADE, null=True, related_name='server_files')
     file = models.CharField(max_length=1024)
 
     class Meta:
         default_permissions = ()
+        unique_together = ("data", "file")
+
+        # Some DBs can shuffle the rows. Here we restore the insertion order.
+        # https://github.com/opencv/cvat/pull/5083#discussion_r1038032715
+        ordering = ('id', )
 
 # For URLs
 class RemoteFile(models.Model):
@@ -439,6 +443,11 @@ class RemoteFile(models.Model):
 
     class Meta:
         default_permissions = ()
+        unique_together = ("data", "file")
+
+        # Some DBs can shuffle the rows. Here we restore the insertion order.
+        # https://github.com/opencv/cvat/pull/5083#discussion_r1038032715
+        ordering = ('id', )
 
 
 class RelatedFile(models.Model):
@@ -451,6 +460,10 @@ class Meta:
         default_permissions = ()
         unique_together = ("data", "path")
 
+        # Some DBs can shuffle the rows. Here we restore the insertion order.
+        # https://github.com/opencv/cvat/pull/5083#discussion_r1038032715
+        ordering = ('id', )
+
 class Segment(models.Model):
     task = models.ForeignKey(Task, on_delete=models.CASCADE)
     start_frame = models.IntegerField()

@@ -763,12 +763,33 @@ class DataSerializer(serializers.ModelSerializer):
         """))
     job_file_mapping = JobFileMapping(required=False, write_only=True)
 
+    upload_file_order = serializers.ListField(
+        child=serializers.CharField(max_length=1024),
+        default=list, allow_empty=True, write_only=True,
+        help_text=textwrap.dedent("""\
+            Allows to specify file order for client_file uploads.
+            Only valid with the "{}" sorting method selected.
+
+            To state that the input files are sent in the correct order,
+            pass an empty list.
+
+            If you want to send files in an arbitrary order
+            and reorder them afterwards on the server,
+            pass the list of file names in the required order.
+        """.format(models.SortingMethod.PREDEFINED))
+    )
+
     class Meta:
         model = models.Data
-        fields = ('chunk_size', 'size', 'image_quality', 'start_frame', 'stop_frame', 'frame_filter',
-            'compressed_chunk_type', 'original_chunk_type', 'client_files', 'server_files', 'server_files_exclude','remote_files', 'use_zip_chunks',
-            'cloud_storage_id', 'use_cache', 'copy_data', 'storage_method', 'storage', 'sorting_method', 'filename_pattern',
-            'job_file_mapping')
+        fields = (
+            'chunk_size', 'size', 'image_quality', 'start_frame', 'stop_frame', 'frame_filter',
+            'compressed_chunk_type', 'original_chunk_type',
+            'client_files', 'server_files', 'remote_files',
+            'use_zip_chunks', 'server_files_exclude',
+            'cloud_storage_id', 'use_cache', 'copy_data', 'storage_method',
+            'storage', 'sorting_method', 'filename_pattern',
+            'job_file_mapping', 'upload_file_order',
+        )
         extra_kwargs = {
             'chunk_size': { 'help_text': "Maximum number of frames per chunk" },
             'size': { 'help_text': "The number of frames" },
@@ -858,8 +879,9 @@ def _pop_data(self, validated_data):
         server_files = validated_data.pop('server_files')
         remote_files = validated_data.pop('remote_files')
 
-        validated_data.pop('job_file_mapping', None) # optional
-        validated_data.pop('server_files_exclude', None) # optional
+        validated_data.pop('job_file_mapping', None) # optional, not present in Data
+        validated_data.pop('upload_file_order', None) # optional, not present in Data
+        validated_data.pop('server_files_exclude', None) # optional, not present in Data
 
         for extra_key in { 'use_zip_chunks', 'use_cache', 'copy_data' }:
             validated_data.pop(extra_key)