Improve caching mechanism for json
allow for only a single copy of a dict to exist in memory at any time
chrisjsewell committed Oct 31, 2020
1 parent cbcaec6 commit b44e350
Showing 12 changed files with 95 additions and 45 deletions.
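The change reworks CacheFolder so that JSON files are cached as dicts rather than as serialised text, so only a single copy of each dict needs to live in memory and can be mutated in place. A minimal usage sketch of the new API shown in the common.py diff below (the folder path and the data are purely illustrative; the directory is assumed to already exist):

from pathlib import Path

from aiida.tools.importexport.archive.common import CacheFolder

# hypothetical unpacked-archive directory, assumed to already exist on disk
with CacheFolder(Path('/tmp/archive-unpacked')) as folder:
    # stored in the in-memory cache as a dict (validated with json.dumps, not yet written)
    folder.write_json('data.json', {'export_data': {}})

    # load_json returns (from_cache, content); a dict served straight from the
    # cache is the cached object itself, so mutating it mutates the cache
    from_cache, data = folder.load_json('data.json')
    assert from_cache
    data['export_data']['Node'] = {}

    # pass ensure_copy=True to get an independent deep copy instead
    _, snapshot = folder.load_json('data.json', ensure_copy=True)
# leaving the context manager flushes the cached content to disk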
92 changes: 71 additions & 21 deletions aiida/tools/importexport/archive/common.py
@@ -9,12 +9,13 @@
###########################################################################
"""Shared resources for the archive."""
from collections import OrderedDict
import copy
import dataclasses
import os
from pathlib import Path
import tarfile
from types import TracebackType
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Type, Union
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, Union
import zipfile

from aiida.common import json # handles byte dumps
@@ -231,32 +232,59 @@ class CacheFolder:
The class can be used as a context manager, and will flush the cache on exit::
with CacheFolder(path) as folder:
# this stored in memory (no file system write)
# this stored in memory (no disk write)
folder.write_text('path/to/file.txt', 'content')
# this will be read from memory
text = folder.read_text('path/to/file.txt')
# all path/to/file.txt will now have been written to the file system
# all path/to/file.txt will now have been written to disk
"""

def __init__(self, path: Union[Path, str], *, limit: int = 100, encoding: str = 'utf8'):
def __init__(self, path: Union[Path, str], *, encoding: str = 'utf8'):
"""Initialise cached folder.
:param path: folder path to cache
:param limit: maximum number of files to cache ()
:param encoding: encoding of text to read/write
"""
self._path = Path(path)
# dict mapping path -> (type, content)
self._cache = OrderedDict() # type: ignore
self._max_items = limit
self._encoding = encoding
self._max_items = 100 # maximum limit of files to store in memory

def _write_object(self, path: str, ctype: str, content: Any):
"""Write an object from the cache to disk.
:param path: relative path of file
:param ctype: the type of the content
:param content: the content to write
"""
if ctype == 'text':
text = content
elif ctype == 'json':
text = json.dumps(content)
else:
raise TypeError(f'Unknown content type: {ctype}')

(self._path / path).write_text(text, encoding=self._encoding)

def flush(self):
"""Flush the cache."""
for path, content in self._cache.items():
(self._path / path).write_text(content, encoding=self._encoding)
for path, (ctype, content) in self._cache.items():
self._write_object(path, ctype, content)

def _limit_cache(self):
"""Ensure the cache does not exceed a set limit.
Content is uncached on a First-In-First-Out basis.
"""
while len(self._cache) > self._max_items:
path, (ctype, content) = self._cache.popitem(last=False)
self._write_object(path, ctype, content)

def get_path(self, flush=True) -> Path:
"""Return the path.
@@ -274,38 +302,60 @@ def write_text(self, path: str, content: str):
:param path: path relative to base folder
"""
self._cache[path] = content
if len(self._cache) > self._max_items:
path, content = self._cache.popitem(last=False)
(self._path / path).write_text(content, encoding=self._encoding)
assert isinstance(content, str)
self._cache[path] = ('text', content)
self._limit_cache()

def read_text(self, path) -> str:
"""write text from the cache or base folder.
:param path: path relative to base folder
"""
if path in self._cache:
return self._cache[path]
return (self._path / path).read_text(self._encoding)
if path not in self._cache:
return (self._path / path).read_text(self._encoding)
ctype, content = self._cache[path]
if ctype == 'text':
return content
if ctype == 'json':
return json.dumps(content)

raise TypeError(f"content of type '{ctype}' could not be converted to text")

def write_json(self, path: str, data: dict):
"""write dict to the cache as json.
:param path: path relative to base folder
"""
content = json.dumps(data)
self.write_text(path, content)
json.dumps(data) # make sure that the data can be converted to json
self._cache[path] = ('json', data)
self._limit_cache()

def read_json(self, path) -> dict:
"""write text from the cache or base folder.
def load_json(self, path: str, ensure_copy: bool = False) -> Tuple[bool, dict]:
"""Load a json file from the cache or base folder.
Important: if the dict is returned directly from the cache, any mutations will affect the cached dict.
:param path: path relative to base folder
:param ensure_copy: ensure the dict is a copy of that from the cache
:returns: (from cache, the content)
If from cache, mutations will directly affect the cache
"""
content = self.read_text(path)
return json.loads(content)
if path not in self._cache:
return False, json.loads((self._path / path).read_text(self._encoding))

ctype, content = self._cache[path]
if ctype == 'text':
return False, json.loads(content)
if ctype == 'json':
if ensure_copy:
return False, copy.deepcopy(content)
return True, content

raise TypeError(f"content of type '{ctype}' could not be converted to a dict")

def remove_file(self, path):
"""Remove a file from both the cache and base folder (if present).
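The _limit_cache and _write_object methods added above cap the cache at 100 entries (_max_items) by writing the oldest entries back to disk first, while flush writes everything out. A standalone sketch of that first-in-first-out eviction idea, using a plain OrderedDict rather than the class itself (illustration only, not AiiDA code):

from collections import OrderedDict

cache = OrderedDict()
max_items = 3  # the class hard-codes 100; a small limit keeps the example short

for name in ('a.json', 'b.json', 'c.json', 'd.json'):
    cache[name] = {'name': name}
    # once the limit is exceeded, pop the oldest entry (FIFO) and persist it
    while len(cache) > max_items:
        oldest_path, oldest_content = cache.popitem(last=False)
        print(f'would write {oldest_path} to disk: {oldest_content}')

print(list(cache))  # ['b.json', 'c.json', 'd.json']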
4 changes: 2 additions & 2 deletions aiida/tools/importexport/archive/migrations/v01_to_v02.py
@@ -27,12 +27,12 @@ def migrate_v1_to_v2(folder: CacheFolder):
old_start = 'aiida.djsite'
new_start = 'aiida.backends.djsite'

metadata = folder.read_json('metadata.json')
_, metadata = folder.load_json('metadata.json')

verify_metadata_version(metadata, old_version)
update_metadata(metadata, new_version)

data = folder.read_json('data.json')
_, data = folder.load_json('data.json')

for field in ['export_data']:
for key in list(data[field]):
4 changes: 2 additions & 2 deletions aiida/tools/importexport/archive/migrations/v02_to_v03.py
@@ -56,12 +56,12 @@ class NodeType(enum.Enum):
'aiida.backends.djsite.db.models.DbAttribute': 'Attribute'
}

metadata = folder.read_json('metadata.json')
_, metadata = folder.load_json('metadata.json')

verify_metadata_version(metadata, old_version)
update_metadata(metadata, new_version)

data = folder.read_json('data.json')
_, data = folder.load_json('data.json')

# Create a mapping from node uuid to node type
mapping = {}
4 changes: 2 additions & 2 deletions aiida/tools/importexport/archive/migrations/v03_to_v04.py
@@ -441,12 +441,12 @@ def migrate_v3_to_v4(folder: CacheFolder):
old_version = '0.3'
new_version = '0.4'

metadata = folder.read_json('metadata.json')
_, metadata = folder.load_json('metadata.json')

verify_metadata_version(metadata, old_version)
update_metadata(metadata, new_version)

data = folder.read_json('data.json')
_, data = folder.load_json('data.json')

# Apply migrations in correct sequential order
migration_base_data_plugin_type_string(data)
4 changes: 2 additions & 2 deletions aiida/tools/importexport/archive/migrations/v04_to_v05.py
@@ -58,12 +58,12 @@ def migrate_v4_to_v5(folder: CacheFolder):
old_version = '0.4'
new_version = '0.5'

metadata = folder.read_json('metadata.json')
_, metadata = folder.load_json('metadata.json')

verify_metadata_version(metadata, old_version)
update_metadata(metadata, new_version)

data = folder.read_json('data.json')
_, data = folder.load_json('data.json')
# Apply migrations
migration_drop_node_columns_nodeversion_public(metadata, data)
migration_drop_computer_transport_params(metadata, data)
4 changes: 2 additions & 2 deletions aiida/tools/importexport/archive/migrations/v05_to_v06.py
@@ -137,12 +137,12 @@ def migrate_v5_to_v6(folder: CacheFolder):
old_version = '0.5'
new_version = '0.6'

metadata = folder.read_json('metadata.json')
_, metadata = folder.load_json('metadata.json')

verify_metadata_version(metadata, old_version)
update_metadata(metadata, new_version)

data = folder.read_json('data.json')
_, data = folder.load_json('data.json')

# Apply migrations
migration_serialize_datetime_objects(data)
4 changes: 2 additions & 2 deletions aiida/tools/importexport/archive/migrations/v06_to_v07.py
@@ -118,12 +118,12 @@ def migrate_v6_to_v7(folder: CacheFolder):
old_version = '0.6'
new_version = '0.7'

metadata = folder.read_json('metadata.json')
_, metadata = folder.load_json('metadata.json')

verify_metadata_version(metadata, old_version)
update_metadata(metadata, new_version)

data = folder.read_json('data.json')
_, data = folder.load_json('data.json')

# Apply migrations
data_migration_legacy_process_attributes(data)
4 changes: 2 additions & 2 deletions aiida/tools/importexport/archive/migrations/v07_to_v08.py
@@ -44,12 +44,12 @@ def migrate_v7_to_v8(folder: CacheFolder):
old_version = '0.7'
new_version = '0.8'

metadata = folder.read_json('metadata.json')
_, metadata = folder.load_json('metadata.json')

verify_metadata_version(metadata, old_version)
update_metadata(metadata, new_version)

data = folder.read_json('data.json')
_, data = folder.load_json('data.json')

# Apply migrations
migration_default_link_label(data)
4 changes: 2 additions & 2 deletions aiida/tools/importexport/archive/migrations/v08_to_v09.py
@@ -52,12 +52,12 @@ def migrate_v8_to_v9(folder: CacheFolder):
old_version = '0.8'
new_version = '0.9'

metadata = folder.read_json('metadata.json')
_, metadata = folder.load_json('metadata.json')

verify_metadata_version(metadata, old_version)
update_metadata(metadata, new_version)

data = folder.read_json('data.json')
_, data = folder.load_json('data.json')

# Apply migrations
migration_dbgroup_type_string(data)
6 changes: 3 additions & 3 deletions tests/tools/importexport/migration/conftest.py
@@ -61,15 +61,15 @@ def _migrate(filename_archive, version_old, version_new, migration_method, archi
raise ValueError('invalid file format, expected either a zip archive or gzipped tarball')

folder = CacheFolder(out_path)
old_metadata = folder.read_json('metadata.json')
_, old_metadata = folder.load_json('metadata.json')
verify_metadata_version(old_metadata, version=version_old)

migration_method(folder)

metadata = folder.read_json('metadata.json')
_, metadata = folder.load_json('metadata.json')
verify_metadata_version(metadata, version=version_new)

data = folder.read_json('data.json')
_, data = folder.load_json('data.json')

return metadata, data

4 changes: 2 additions & 2 deletions tests/tools/importexport/migration/test_migration_funcs.py
@@ -44,8 +44,8 @@ def test_migrations(migration_data, tmp_path):
folder = CacheFolder(out_path)
migration_method(folder)

metadata_old = folder.read_json('metadata.json')
data_old = folder.read_json('data.json')
_, metadata_old = folder.load_json('metadata.json')
_, data_old = folder.load_json('data.json')

verify_metadata_version(metadata_old, version=version_new)

6 changes: 3 additions & 3 deletions tests/tools/importexport/migration/test_v03_to_v04.py
@@ -60,8 +60,8 @@ def test_migrate_external(external_archive, tmp_path):
# Migrate to v0.4
folder = CacheFolder(out_path)
migrate_v3_to_v4(folder)
metadata = folder.read_json('metadata.json')
data = folder.read_json('data.json')
_, metadata = folder.load_json('metadata.json')
_, data = folder.load_json('data.json')
verify_metadata_version(metadata, version='0.4')

## Following checks are based on the archive-file
@@ -352,7 +352,7 @@ def test_illegal_create_links(external_archive, tmp_path):
folder = CacheFolder(out_path)
migrate_v3_to_v4(folder)

data = folder.read_json('data.json')
_, data = folder.load_json('data.json')

# Check illegal create links were removed
assert len(data['links_uuid']) == links_count_migrated, (
