diff --git a/CHANGELOG.md b/CHANGELOG.md index ec03b776985e..9a6294bdaea8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed multiple errors which arises when polygon is of length 5 or less () - Fixed task creation from PDF () - Fixed CVAT format import for frame stepped tasks () +- Fixed the reading problem with large PDFs () - Fixed unnecessary pyhash dependency () ### Security diff --git a/cvat/apps/engine/cache.py b/cvat/apps/engine/cache.py index 2f8b0a628c35..bfe79a4e18ee 100644 --- a/cvat/apps/engine/cache.py +++ b/cvat/apps/engine/cache.py @@ -2,14 +2,17 @@ # # SPDX-License-Identifier: MIT +import os +from io import BytesIO + from diskcache import Cache from django.conf import settings -from cvat.apps.engine.media_extractors import (Mpeg4ChunkWriter, ZipChunkWriter, - Mpeg4CompressedChunkWriter, ZipCompressedChunkWriter) + +from cvat.apps.engine.media_extractors import (Mpeg4ChunkWriter, + Mpeg4CompressedChunkWriter, ZipChunkWriter, ZipCompressedChunkWriter) from cvat.apps.engine.models import DataChoice -from .prepare import PrepareInfo -import os -from io import BytesIO +from cvat.apps.engine.prepare import PrepareInfo + class CacheInteraction: def __init__(self): @@ -27,7 +30,7 @@ def get_buff_mime(self, chunk_number, quality, db_data): return chunk, tag def prepare_chunk_buff(self, db_data, quality, chunk_number): - from cvat.apps.engine.frame_provider import FrameProvider + from cvat.apps.engine.frame_provider import FrameProvider # TODO: remove circular dependency extractor_classes = { FrameProvider.Quality.COMPRESSED : Mpeg4CompressedChunkWriter if db_data.compressed_chunk_type == DataChoice.VIDEO else ZipCompressedChunkWriter, FrameProvider.Quality.ORIGINAL : Mpeg4ChunkWriter if db_data.original_chunk_type == DataChoice.VIDEO else ZipChunkWriter, @@ -54,4 +57,4 @@ def prepare_chunk_buff(self, db_data, quality, chunk_number): return buff, mime_type def save_chunk(self, db_data_id, chunk_number, quality, buff, mime_type): - self._cache.set('{}_{}_{}'.format(db_data_id, chunk_number, quality), buff, tag=mime_type) \ No newline at end of file + self._cache.set('{}_{}_{}'.format(db_data_id, chunk_number, quality), buff, tag=mime_type) diff --git a/cvat/apps/engine/frame_provider.py b/cvat/apps/engine/frame_provider.py index 8f56463a45f7..ed96bf99f3c0 100644 --- a/cvat/apps/engine/frame_provider.py +++ b/cvat/apps/engine/frame_provider.py @@ -9,10 +9,11 @@ import numpy as np from PIL import Image +from cvat.apps.engine.cache import CacheInteraction from cvat.apps.engine.media_extractors import VideoReader, ZipReader from cvat.apps.engine.mime_types import mimetypes from cvat.apps.engine.models import DataChoice, StorageMethodChoice -from .cache import CacheInteraction + class RandomAccessIterator: def __init__(self, iterable): diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py index 08a660c5d68a..d9eead2b9f7e 100644 --- a/cvat/apps/engine/media_extractors.py +++ b/cvat/apps/engine/media_extractors.py @@ -7,6 +7,7 @@ import shutil import zipfile import io +import itertools from abc import ABC, abstractmethod import av @@ -65,9 +66,16 @@ def _get_preview(obj): return preview.convert('RGB') @abstractmethod - def get_image_size(self): + def get_image_size(self, i): pass + def __len__(self): + return len(self.frame_range) + + @property + def frame_range(self): + return range(self._start, self._stop, self._step) + class ImageListReader(IMediaReader): def __init__(self, source_path, step=1, start=0, stop=None): if not source_path: @@ -104,8 +112,8 @@ def get_preview(self): fp = open(self._source_path[0], "rb") return self._get_preview(fp) - def get_image_size(self): - img = Image.open(self._source_path[0]) + def get_image_size(self, i): + img = Image.open(self._source_path[i]) return img.width, img.height class DirectoryReader(ImageListReader): @@ -127,6 +135,7 @@ class ArchiveReader(DirectoryReader): def __init__(self, source_path, step=1, start=0, stop=None): self._archive_source = source_path[0] Archive(self._archive_source).extractall(os.path.dirname(source_path[0])) + os.remove(self._archive_source) super().__init__( source_path=[os.path.dirname(source_path[0])], step=step, @@ -134,37 +143,37 @@ def __init__(self, source_path, step=1, start=0, stop=None): stop=stop, ) - def __del__(self): - os.remove(self._archive_source) - -class PdfReader(DirectoryReader): +class PdfReader(ImageListReader): def __init__(self, source_path, step=1, start=0, stop=None): if not source_path: raise Exception('No PDF found') - from pdf2image import convert_from_path self._pdf_source = source_path[0] - self._tmp_dir = create_tmp_dir() - file_ = convert_from_path(self._pdf_source) - basename = os.path.splitext(os.path.basename(self._pdf_source))[0] - for page_num, page in enumerate(file_): - output = os.path.join(self._tmp_dir, '{}{:09d}.jpeg'.format(basename, page_num)) - page.save(output, 'JPEG') + + _basename = os.path.splitext(os.path.basename(self._pdf_source))[0] + _counter = itertools.count() + def _make_name(): + for page_num in _counter: + yield '{}{:09d}.jpeg'.format(_basename, page_num) + + from pdf2image import convert_from_path + self._tmp_dir = os.path.dirname(source_path[0]) + os.makedirs(self._tmp_dir, exist_ok=True) + + # Avoid OOM: https://github.com/openvinotoolkit/cvat/issues/940 + paths = convert_from_path(self._pdf_source, + last_page=stop, paths_only=True, + output_folder=self._tmp_dir, fmt="jpeg", output_file=_make_name()) + + os.remove(source_path[0]) super().__init__( - source_path=[self._tmp_dir], + source_path=paths, step=step, start=start, stop=stop, ) - def __del__(self): - delete_tmp_dir(self._tmp_dir) - - def get_path(self, i): - base_dir = os.path.dirname(self._pdf_source) - return os.path.join(base_dir, os.path.relpath(self._source_path[i], self._tmp_dir)) - class ZipReader(ImageListReader): def __init__(self, source_path, step=1, start=0, stop=None): self._zip_source = zipfile.ZipFile(source_path[0], mode='r') @@ -178,8 +187,8 @@ def get_preview(self): io_image = io.BytesIO(self._zip_source.read(self._source_path[0])) return self._get_preview(io_image) - def get_image_size(self): - img = Image.open(io.BytesIO(self._zip_source.read(self._source_path[0]))) + def get_image_size(self, i): + img = Image.open(io.BytesIO(self._zip_source.read(self._source_path[i]))) return img.width, img.height def get_image(self, i): @@ -243,7 +252,7 @@ def get_preview(self): preview = next(container.decode(stream)) return self._get_preview(preview.to_image()) - def get_image_size(self): + def get_image_size(self, i): image = (next(iter(self)))[0] return image.width, image.height diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index e89877295088..4d2e6cafded0 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -273,7 +273,7 @@ def update_progress(progress): # calculate chunk size if it isn't specified if db_data.chunk_size is None: if isinstance(compressed_chunk_writer, ZipCompressedChunkWriter): - w, h = extractor.get_image_size() + w, h = extractor.get_image_size(0) area = h * w db_data.chunk_size = max(2, min(72, 36 * 1920 * 1080 // area)) else: @@ -285,58 +285,48 @@ def update_progress(progress): if settings.USE_CACHE and db_data.storage_method == StorageMethodChoice.CACHE: for media_type, media_files in media.items(): - if media_files: - if task_mode == MEDIA_TYPES['video']['mode']: - try: - analyzer = AnalyzeVideo(source_path=os.path.join(upload_dir, media_files[0])) - analyzer.check_type_first_frame() - analyzer.check_video_timestamps_sequences() - - meta_info = PrepareInfo(source_path=os.path.join(upload_dir, media_files[0]), - meta_path=os.path.join(upload_dir, 'meta_info.txt')) - meta_info.save_key_frames() - meta_info.check_seek_key_frames() - meta_info.save_meta_info() - - all_frames = meta_info.get_task_size() - db_data.size = len(range(db_data.start_frame, min(data['stop_frame'] + 1 if data['stop_frame'] else all_frames, all_frames), db_data.get_frame_step())) - video_path = os.path.join(upload_dir, media_files[0]) - frame = meta_info.key_frames.get(next(iter(meta_info.key_frames))) - video_size = (frame.width, frame.height) - - except Exception: - db_data.storage_method = StorageMethodChoice.FILE_SYSTEM - - else:#images,archive - counter_ = itertools.count() - if isinstance(extractor, MEDIA_TYPES['archive']['extractor']): - media_files = [os.path.relpath(path, upload_dir) for path in extractor._source_path] - elif isinstance(extractor, (MEDIA_TYPES['zip']['extractor'], MEDIA_TYPES['pdf']['extractor'])): - media_files = extractor._source_path - - numbers_sequence = range(db_data.start_frame, min(data['stop_frame'] if data['stop_frame'] else len(media_files), len(media_files)), db_data.get_frame_step()) - m_paths = [] - m_paths = [(path, numb) for numb, path in enumerate(sorted(media_files)) if numb in numbers_sequence] - - for chunk_number, media_paths in itertools.groupby(m_paths, lambda x: next(counter_) // db_data.chunk_size): - media_paths = list(media_paths) - img_sizes = [] - from PIL import Image - with open(db_data.get_dummy_chunk_path(chunk_number), 'w') as dummy_chunk: - for path, _ in media_paths: - dummy_chunk.write(path+'\n') - img_sizes += [Image.open(os.path.join(upload_dir, path)).size] - - db_data.size += len(media_paths) - db_images.extend([ - models.Image( - data=db_data, - path=data[0], - frame=data[1], - width=size[0], - height=size[1]) - for data, size in zip(media_paths, img_sizes) - ]) + if not media_files: + continue + + if task_mode == MEDIA_TYPES['video']['mode']: + try: + analyzer = AnalyzeVideo(source_path=os.path.join(upload_dir, media_files[0])) + analyzer.check_type_first_frame() + analyzer.check_video_timestamps_sequences() + + meta_info = PrepareInfo(source_path=os.path.join(upload_dir, media_files[0]), + meta_path=os.path.join(upload_dir, 'meta_info.txt')) + meta_info.save_key_frames() + meta_info.check_seek_key_frames() + meta_info.save_meta_info() + + all_frames = meta_info.get_task_size() + db_data.size = len(range(db_data.start_frame, min(data['stop_frame'] + 1 if data['stop_frame'] else all_frames, all_frames), db_data.get_frame_step())) + video_path = os.path.join(upload_dir, media_files[0]) + frame = meta_info.key_frames.get(next(iter(meta_info.key_frames))) + video_size = (frame.width, frame.height) + + except Exception: + db_data.storage_method = StorageMethodChoice.FILE_SYSTEM + + else:#images,archive + db_data.size = len(extractor) + + counter = itertools.count() + for chunk_number, chunk_frames in itertools.groupby(extractor.frame_range, lambda x: next(counter) // db_data.chunk_size): + chunk_paths = [(extractor.get_path(i), i) for i in chunk_frames] + img_sizes = [] + with open(db_data.get_dummy_chunk_path(chunk_number), 'w') as dummy_chunk: + for path, frame_id in chunk_paths: + dummy_chunk.write(path + '\n') + img_sizes.append(extractor.get_image_size(frame_id)) + + db_images.extend([ + models.Image(data=db_data, + path=os.path.relpath(path, upload_dir), + frame=frame, width=w, height=h) + for (path, frame), (w, h) in zip(chunk_paths, img_sizes) + ]) if db_data.storage_method == StorageMethodChoice.FILE_SYSTEM or not settings.USE_CACHE: counter = itertools.count() @@ -383,5 +373,5 @@ def update_progress(progress): preview = extractor.get_preview() preview.save(db_data.get_preview_path()) - slogger.glob.info("Founded frames {} for Data #{}".format(db_data.size, db_data.id)) + slogger.glob.info("Found frames {} for Data #{}".format(db_data.size, db_data.id)) _save_task_to_db(db_task) \ No newline at end of file diff --git a/cvat/apps/engine/tests/_test_rest_api.py b/cvat/apps/engine/tests/_test_rest_api.py index 374a4cd2e213..cddd36bb3b48 100644 --- a/cvat/apps/engine/tests/_test_rest_api.py +++ b/cvat/apps/engine/tests/_test_rest_api.py @@ -70,6 +70,7 @@ def _setUpModule(): import av import numpy as np +from pdf2image import convert_from_bytes from django.conf import settings from django.contrib.auth.models import Group, User from django.http import HttpResponse @@ -1527,6 +1528,19 @@ def generate_zip_archive_file(filename, count): zip_buf.seek(0) return image_sizes, zip_buf +def generate_pdf_file(filename, page_count=1): + images = [Image.fromarray(np.ones((50, 100, 3), dtype=np.uint8)) + for _ in range(page_count)] + image_sizes = [img.size for img in images] + + file_buf = BytesIO() + images[0].save(file_buf, 'pdf', save_all=True, resolution=200, + append_images=images[1:]) + + file_buf.name = filename + file_buf.seek(0) + return image_sizes, file_buf + class TaskDataAPITestCase(APITestCase): _image_sizes = {} @@ -1754,6 +1768,10 @@ def _test_api_v1_tasks_id_data_spec(self, user, spec, data, expected_compressed_ for f in source_files: if zipfile.is_zipfile(f): source_images.extend(self._extract_zip_chunk(f)) + elif isinstance(f, io.BytesIO) and \ + str(getattr(f, 'name', None)).endswith('.pdf'): + source_images.extend(convert_from_bytes(f.getvalue(), + fmt='png')) else: source_images.append(Image.open(f)) @@ -1919,7 +1937,7 @@ def _test_api_v1_tasks_id_data(self, user): self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, self.ChunkType.IMAGESET, image_sizes) task_spec = { - "name": "use_cache video task #8", + "name": "cached video task #8", "overlap": 0, "segment_size": 0, "labels": [ @@ -1937,10 +1955,10 @@ def _test_api_v1_tasks_id_data(self, user): image_sizes = self._image_sizes[task_data["server_files[0]"]] self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.VIDEO, - self.ChunkType.VIDEO, image_sizes, StorageMethodChoice.CACHE) + self.ChunkType.VIDEO, image_sizes, StorageMethodChoice.CACHE) task_spec = { - "name": "use_cache images task #9", + "name": "cached images task #9", "overlap": 0, "segment_size": 0, "labels": [ @@ -1963,10 +1981,10 @@ def _test_api_v1_tasks_id_data(self, user): ] self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, - self.ChunkType.IMAGESET, image_sizes, StorageMethodChoice.CACHE) + self.ChunkType.IMAGESET, image_sizes, StorageMethodChoice.CACHE) task_spec = { - "name": "my zip archive task #10", + "name": "my cached zip archive task #10", "overlap": 0, "segment_size": 0, "labels": [ @@ -1984,7 +2002,49 @@ def _test_api_v1_tasks_id_data(self, user): image_sizes = self._image_sizes[task_data["server_files[0]"]] self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, - self.ChunkType.IMAGESET, image_sizes, StorageMethodChoice.CACHE) + self.ChunkType.IMAGESET, image_sizes, StorageMethodChoice.CACHE) + + task_spec = { + "name": "my cached pdf task #11", + "overlap": 0, + "segment_size": 0, + "labels": [ + {"name": "car"}, + {"name": "person"}, + ] + } + + image_sizes, document = generate_pdf_file("test_pdf_1.pdf", 5) + + task_data = { + "client_files[0]": document, + "image_quality": 70, + "use_cache": True + } + + self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, + self.ChunkType.IMAGESET, self.ChunkType.IMAGESET, + image_sizes, StorageMethodChoice.CACHE) + + task_spec = { + "name": "my pdf task #12", + "overlap": 0, + "segment_size": 0, + "labels": [ + {"name": "car"}, + {"name": "person"}, + ] + } + + image_sizes, document = generate_pdf_file("test_pdf_2.pdf", 4) + + task_data = { + "client_files[0]": document, + "image_quality": 70, + } + + self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, + self.ChunkType.IMAGESET, self.ChunkType.IMAGESET, image_sizes) def test_api_v1_tasks_id_data_admin(self): self._test_api_v1_tasks_id_data(self.admin)