Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issue: Original pdf file is deleted #3967

Merged
merged 5 commits into from
Dec 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed task creating with large files via webpage (<https://github.com/openvinotoolkit/cvat/pull/3692>)
- Added information to export CVAT_HOST when performing local installation for accessing over network (<https://github.com/openvinotoolkit/cvat/pull/4014>)
- Fixed possible color collisions in the generated colormap (<https://github.com/openvinotoolkit/cvat/pull/4007>)
- Original pdf file is deleted when using share(<https://github.com/openvinotoolkit/cvat/pull/3967>)

### Security
- TDB
Expand Down
25 changes: 15 additions & 10 deletions cvat/apps/engine/media_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,14 +217,16 @@ def __init__(self,
start=0,
stop=None,
dimension=DimensionType.DIM_2D,
sorting_method=SortingMethod.LEXICOGRAPHICAL):
sorting_method=SortingMethod.LEXICOGRAPHICAL,
extract_dir=None):

self._archive_source = source_path[0]
extract_dir = source_path[1] if len(source_path) > 1 else os.path.dirname(source_path[0])
Archive(self._archive_source).extractall(extract_dir)
if extract_dir == os.path.dirname(source_path[0]):
tmp_dir = extract_dir if extract_dir else os.path.dirname(source_path[0])
Archive(self._archive_source).extractall(tmp_dir)
if not extract_dir:
os.remove(self._archive_source)
super().__init__(
source_path=[extract_dir],
source_path=[tmp_dir],
step=step,
start=start,
stop=stop,
Expand All @@ -239,7 +241,8 @@ def __init__(self,
start=0,
stop=None,
dimension=DimensionType.DIM_2D,
sorting_method=SortingMethod.LEXICOGRAPHICAL):
sorting_method=SortingMethod.LEXICOGRAPHICAL,
extract_dir=None):
if not source_path:
raise Exception('No PDF found')

Expand All @@ -252,15 +255,16 @@ def _make_name():
yield '{}{:09d}.jpeg'.format(_basename, page_num)

from pdf2image import convert_from_path
self._tmp_dir = os.path.dirname(source_path[0])
self._tmp_dir = extract_dir if extract_dir else os.path.dirname(source_path[0])
os.makedirs(self._tmp_dir, exist_ok=True)

# Avoid OOM: https://github.com/openvinotoolkit/cvat/issues/940
paths = convert_from_path(self._pdf_source,
last_page=stop, paths_only=True,
output_folder=self._tmp_dir, fmt="jpeg", output_file=_make_name())

os.remove(source_path[0])
if not extract_dir:
os.remove(source_path[0])

super().__init__(
source_path=paths,
Expand All @@ -278,9 +282,10 @@ def __init__(self,
start=0,
stop=None,
dimension=DimensionType.DIM_2D,
sorting_method=SortingMethod.LEXICOGRAPHICAL):
sorting_method=SortingMethod.LEXICOGRAPHICAL,
extract_dir=None):
self._zip_source = zipfile.ZipFile(source_path[0], mode='r')
self.extract_dir = source_path[1] if len(source_path) > 1 else None
self.extract_dir = extract_dir
file_list = [f for f in self._zip_source.namelist() if files_to_ignore(f) and get_mime(f) == 'image']
super().__init__(file_list,
step=step,
Expand Down
9 changes: 4 additions & 5 deletions cvat/apps/engine/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,17 +325,16 @@ def _create_thread(tid, data, isImport=False):
data['sorting_method'] in {models.SortingMethod.RANDOM, models.SortingMethod.PREDEFINED}:
raise Exception("It isn't supported to import the task that was created without cache but with random/predefined sorting")

if media_type in {'archive', 'zip'} and db_data.storage == models.StorageChoice.SHARE:
source_paths.append(db_data.get_upload_dirname())
upload_dir = db_data.get_upload_dirname()
db_data.storage = models.StorageChoice.LOCAL

details = {
'source_path': source_paths,
'step': db_data.get_frame_step(),
'start': db_data.start_frame,
'stop': data['stop_frame'],
}
if media_type in {'archive', 'zip', 'pdf'} and db_data.storage == models.StorageChoice.SHARE:
details['extract_dir'] = db_data.get_upload_dirname()
upload_dir = db_data.get_upload_dirname()
db_data.storage = models.StorageChoice.LOCAL
if media_type != 'video':
details['sorting_method'] = data['sorting_method']
extractor = MEDIA_TYPES[media_type]['extractor'](**details)
Expand Down
34 changes: 31 additions & 3 deletions cvat/apps/engine/tests/test_rest_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2767,6 +2767,13 @@ def setUpClass(cls):
shutil.rmtree(root_path)
cls._image_sizes[filename] = image_sizes

file_name = 'test_1.pdf'
path = os.path.join(settings.SHARE_ROOT, file_name)
img_sizes, data = generate_pdf_file(file_name, page_count=5)
with open(path, "wb") as pdf_file:
pdf_file.write(data.read())
cls._image_sizes[file_name] = img_sizes

generate_manifest_file(data_type='video', manifest_path=os.path.join(settings.SHARE_ROOT, 'videos', 'manifest.jsonl'),
sources=[os.path.join(settings.SHARE_ROOT, 'videos', 'test_video_1.mp4')])

Expand Down Expand Up @@ -2804,6 +2811,9 @@ def tearDownClass(cls):
path = os.path.join(settings.SHARE_ROOT, "manifest.jsonl")
os.remove(path)

path = os.path.join(settings.SHARE_ROOT, "test_1.pdf")
os.remove(path)

def _run_api_v1_tasks_id_data_post(self, tid, user, data):
with ForceLogin(user, self.client):
response = self.client.post('/api/v1/tasks/{}/data'.format(tid),
Expand Down Expand Up @@ -2886,10 +2896,12 @@ def _test_api_v1_tasks_id_data_spec(self, user, spec, data, expected_compressed_
db_data = Task.objects.get(pk=task_id).data
self.assertEqual(expected_storage_method, db_data.storage_method)
self.assertEqual(expected_uploaded_data_location, db_data.storage)
# check if used share without copying inside and files doesn`t exist in ../raw/
# check if used share without copying inside and files doesn`t exist in ../raw/ and exist in share
if expected_uploaded_data_location is StorageChoice.SHARE:
self.assertEqual(False,
os.path.exists(os.path.join(db_data.get_upload_dirname(), next(iter(data.values())))))
raw_file_path = os.path.join(db_data.get_upload_dirname(), next(iter(data.values())))
share_file_path = os.path.join(settings.SHARE_ROOT, next(iter(data.values())))
self.assertEqual(False, os.path.exists(raw_file_path))
self.assertEqual(True, os.path.exists(share_file_path))

# check preview
response = self._get_preview(task_id, user)
Expand Down Expand Up @@ -2956,6 +2968,10 @@ def _test_api_v1_tasks_id_data_spec(self, user, spec, data, expected_compressed_
for f in source_files:
if zipfile.is_zipfile(f):
source_images.extend(self._extract_zip_chunk(f, dimension=dimension))
elif isinstance(f, str) and f.endswith('.pdf'):
with open(f, 'rb') as pdf_file:
source_images.extend(convert_from_bytes(pdf_file.read(),
fmt='png'))
elif isinstance(f, io.BytesIO) and \
str(getattr(f, 'name', None)).endswith('.pdf'):
source_images.extend(convert_from_bytes(f.getvalue(),
Expand Down Expand Up @@ -3475,6 +3491,18 @@ def _test_api_v1_tasks_id_data(self, user):
self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, self.ChunkType.IMAGESET,
image_sizes, StorageMethodChoice.CACHE, StorageChoice.SHARE)

task_spec.update([('name', 'task pdf in the shared folder #30')])
task_data = {
"server_files[0]": "test_1.pdf",
"image_quality": 70,
"copy_data": False,
"use_cache": True,
}
image_sizes = self._image_sizes[task_data["server_files[0]"]]

self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, self.ChunkType.IMAGESET,
image_sizes, StorageMethodChoice.CACHE, StorageChoice.LOCAL)

def test_api_v1_tasks_id_data_admin(self):
self._test_api_v1_tasks_id_data(self.admin)

Expand Down