Skip to content

Commit

Permalink
Fix issue: Original pdf file is deleted (#3967)
Browse files Browse the repository at this point in the history
  • Loading branch information
Marishka17 authored Dec 16, 2021
1 parent 73a0c64 commit cde33ac
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 18 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed task creating with large files via webpage (<https://github.com/openvinotoolkit/cvat/pull/3692>)
- Added information to export CVAT_HOST when performing local installation for accessing over network (<https://github.com/openvinotoolkit/cvat/pull/4014>)
- Fixed possible color collisions in the generated colormap (<https://github.com/openvinotoolkit/cvat/pull/4007>)
- Original pdf file is deleted when using share(<https://github.com/openvinotoolkit/cvat/pull/3967>)

### Security
- TDB
Expand Down
25 changes: 15 additions & 10 deletions cvat/apps/engine/media_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,14 +217,16 @@ def __init__(self,
start=0,
stop=None,
dimension=DimensionType.DIM_2D,
sorting_method=SortingMethod.LEXICOGRAPHICAL):
sorting_method=SortingMethod.LEXICOGRAPHICAL,
extract_dir=None):

self._archive_source = source_path[0]
extract_dir = source_path[1] if len(source_path) > 1 else os.path.dirname(source_path[0])
Archive(self._archive_source).extractall(extract_dir)
if extract_dir == os.path.dirname(source_path[0]):
tmp_dir = extract_dir if extract_dir else os.path.dirname(source_path[0])
Archive(self._archive_source).extractall(tmp_dir)
if not extract_dir:
os.remove(self._archive_source)
super().__init__(
source_path=[extract_dir],
source_path=[tmp_dir],
step=step,
start=start,
stop=stop,
Expand All @@ -239,7 +241,8 @@ def __init__(self,
start=0,
stop=None,
dimension=DimensionType.DIM_2D,
sorting_method=SortingMethod.LEXICOGRAPHICAL):
sorting_method=SortingMethod.LEXICOGRAPHICAL,
extract_dir=None):
if not source_path:
raise Exception('No PDF found')

Expand All @@ -252,15 +255,16 @@ def _make_name():
yield '{}{:09d}.jpeg'.format(_basename, page_num)

from pdf2image import convert_from_path
self._tmp_dir = os.path.dirname(source_path[0])
self._tmp_dir = extract_dir if extract_dir else os.path.dirname(source_path[0])
os.makedirs(self._tmp_dir, exist_ok=True)

# Avoid OOM: https://github.com/openvinotoolkit/cvat/issues/940
paths = convert_from_path(self._pdf_source,
last_page=stop, paths_only=True,
output_folder=self._tmp_dir, fmt="jpeg", output_file=_make_name())

os.remove(source_path[0])
if not extract_dir:
os.remove(source_path[0])

super().__init__(
source_path=paths,
Expand All @@ -278,9 +282,10 @@ def __init__(self,
start=0,
stop=None,
dimension=DimensionType.DIM_2D,
sorting_method=SortingMethod.LEXICOGRAPHICAL):
sorting_method=SortingMethod.LEXICOGRAPHICAL,
extract_dir=None):
self._zip_source = zipfile.ZipFile(source_path[0], mode='r')
self.extract_dir = source_path[1] if len(source_path) > 1 else None
self.extract_dir = extract_dir
file_list = [f for f in self._zip_source.namelist() if files_to_ignore(f) and get_mime(f) == 'image']
super().__init__(file_list,
step=step,
Expand Down
9 changes: 4 additions & 5 deletions cvat/apps/engine/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,17 +325,16 @@ def _create_thread(tid, data, isImport=False):
data['sorting_method'] in {models.SortingMethod.RANDOM, models.SortingMethod.PREDEFINED}:
raise Exception("It isn't supported to import the task that was created without cache but with random/predefined sorting")

if media_type in {'archive', 'zip'} and db_data.storage == models.StorageChoice.SHARE:
source_paths.append(db_data.get_upload_dirname())
upload_dir = db_data.get_upload_dirname()
db_data.storage = models.StorageChoice.LOCAL

details = {
'source_path': source_paths,
'step': db_data.get_frame_step(),
'start': db_data.start_frame,
'stop': data['stop_frame'],
}
if media_type in {'archive', 'zip', 'pdf'} and db_data.storage == models.StorageChoice.SHARE:
details['extract_dir'] = db_data.get_upload_dirname()
upload_dir = db_data.get_upload_dirname()
db_data.storage = models.StorageChoice.LOCAL
if media_type != 'video':
details['sorting_method'] = data['sorting_method']
extractor = MEDIA_TYPES[media_type]['extractor'](**details)
Expand Down
34 changes: 31 additions & 3 deletions cvat/apps/engine/tests/test_rest_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2767,6 +2767,13 @@ def setUpClass(cls):
shutil.rmtree(root_path)
cls._image_sizes[filename] = image_sizes

file_name = 'test_1.pdf'
path = os.path.join(settings.SHARE_ROOT, file_name)
img_sizes, data = generate_pdf_file(file_name, page_count=5)
with open(path, "wb") as pdf_file:
pdf_file.write(data.read())
cls._image_sizes[file_name] = img_sizes

generate_manifest_file(data_type='video', manifest_path=os.path.join(settings.SHARE_ROOT, 'videos', 'manifest.jsonl'),
sources=[os.path.join(settings.SHARE_ROOT, 'videos', 'test_video_1.mp4')])

Expand Down Expand Up @@ -2804,6 +2811,9 @@ def tearDownClass(cls):
path = os.path.join(settings.SHARE_ROOT, "manifest.jsonl")
os.remove(path)

path = os.path.join(settings.SHARE_ROOT, "test_1.pdf")
os.remove(path)

def _run_api_v1_tasks_id_data_post(self, tid, user, data):
with ForceLogin(user, self.client):
response = self.client.post('/api/v1/tasks/{}/data'.format(tid),
Expand Down Expand Up @@ -2886,10 +2896,12 @@ def _test_api_v1_tasks_id_data_spec(self, user, spec, data, expected_compressed_
db_data = Task.objects.get(pk=task_id).data
self.assertEqual(expected_storage_method, db_data.storage_method)
self.assertEqual(expected_uploaded_data_location, db_data.storage)
# check if used share without copying inside and files doesn`t exist in ../raw/
# check if used share without copying inside and files doesn`t exist in ../raw/ and exist in share
if expected_uploaded_data_location is StorageChoice.SHARE:
self.assertEqual(False,
os.path.exists(os.path.join(db_data.get_upload_dirname(), next(iter(data.values())))))
raw_file_path = os.path.join(db_data.get_upload_dirname(), next(iter(data.values())))
share_file_path = os.path.join(settings.SHARE_ROOT, next(iter(data.values())))
self.assertEqual(False, os.path.exists(raw_file_path))
self.assertEqual(True, os.path.exists(share_file_path))

# check preview
response = self._get_preview(task_id, user)
Expand Down Expand Up @@ -2956,6 +2968,10 @@ def _test_api_v1_tasks_id_data_spec(self, user, spec, data, expected_compressed_
for f in source_files:
if zipfile.is_zipfile(f):
source_images.extend(self._extract_zip_chunk(f, dimension=dimension))
elif isinstance(f, str) and f.endswith('.pdf'):
with open(f, 'rb') as pdf_file:
source_images.extend(convert_from_bytes(pdf_file.read(),
fmt='png'))
elif isinstance(f, io.BytesIO) and \
str(getattr(f, 'name', None)).endswith('.pdf'):
source_images.extend(convert_from_bytes(f.getvalue(),
Expand Down Expand Up @@ -3475,6 +3491,18 @@ def _test_api_v1_tasks_id_data(self, user):
self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, self.ChunkType.IMAGESET,
image_sizes, StorageMethodChoice.CACHE, StorageChoice.SHARE)

task_spec.update([('name', 'task pdf in the shared folder #30')])
task_data = {
"server_files[0]": "test_1.pdf",
"image_quality": 70,
"copy_data": False,
"use_cache": True,
}
image_sizes = self._image_sizes[task_data["server_files[0]"]]

self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, self.ChunkType.IMAGESET,
image_sizes, StorageMethodChoice.CACHE, StorageChoice.LOCAL)

def test_api_v1_tasks_id_data_admin(self):
self._test_api_v1_tasks_id_data(self.admin)

Expand Down

0 comments on commit cde33ac

Please sign in to comment.