Local file cache (#696)

IAMconsortium · Sep 14, 2022 · d7d9b45 · d7d9b45
1 parent d2d7a84
commit d7d9b45
Show file tree

Hide file tree

Showing 5 changed files with 130 additions and 4 deletions.
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -11,6 +11,7 @@ dependency for better performance.
 - [#701](https://github.com/IAMconsortium/pyam/pull/701) Add **xlsxwriter** as dependency to improve `to_excel()` performance
 - [#699](https://github.com/IAMconsortium/pyam/pull/699) Add filter options to IIASA API `index()`, `meta()` and `properties()` methods
 - [#697](https://github.com/IAMconsortium/pyam/pull/697) Add warning if IIASA API returns empty result
+- [#696](https://github.com/IAMconsortium/pyam/pull/696) Add ability to load data preferentially from a local cache
 - [#695](https://github.com/IAMconsortium/pyam/pull/695) Remove unused meta levels during initialization
 - [#688](https://github.com/IAMconsortium/pyam/pull/688) Remove ixmp as optional dependency
 - [#684](https://github.com/IAMconsortium/pyam/pull/684) Use new IIASA-manager API with token refresh 

diff --git a/doc/source/tutorials/iiasa_dbs.ipynb b/doc/source/tutorials/iiasa_dbs.ipynb
@@ -268,6 +268,28 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Loading all of a large database may take a few minutes on some connections. To save time when re-running code, you can keep a local copy of the data via the __lazy_read_iiasa__ function. It takes a file location in addition to the connection and filter options shown above. The first time the code is run, the result is stored at that location; on subsequent runs, the data is read from that file unless the database has been updated in the meantime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lazy_df = pyam.lazy_read_iiasa(\n",
+    "    file=\"./tmp/messageix_co2_coal_data.csv\",\n",
+    "    name=\"iamc15\",\n",
+    "    model='MESSAGEix*', \n",
+    "    variable=['Emissions|CO2', 'Primary Energy|Coal'], \n",
+    "    region='World'\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/pyam/__init__.py b/pyam/__init__.py
@@ -16,7 +16,7 @@
 from pyam.timeseries import *
 from pyam.logging import *
 from pyam.run_control import *
-from pyam.iiasa import read_iiasa  # noqa: F401
+from pyam.iiasa import read_iiasa, lazy_read_iiasa  # noqa: F401
 from pyam.datareader import read_worldbank  # noqa: F401
 from pyam.unfccc import read_unfccc  # noqa: F401
 from pyam.testing import assert_iamframe_equal  # noqa: F401

diff --git a/pyam/iiasa.py b/pyam/iiasa.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 import json
 import logging
+import os.path
 import requests
 
 import httpx
@@ -575,15 +576,16 @@ def read_iiasa(name, default=True, meta=True, creds=None, base_url=_AUTH_URL, **
     ----------
     name : str
         A valid name of an IIASA scenario explorer instance,
-        see :attr:`pyam.iiasa.Connection.valid_connections`
+        see :attr:`pyam.iiasa.Connection.valid_connections`. Obtain a list of options
+        via pyam.iiasa.Connection().valid_connections.
     default : bool, optional
         Return *only* the default version of each scenario.
         Any (`model`, `scenario`) without a default version is omitted.
         If :obj:`False`, return all versions.
     meta : bool or list of strings, optional
         If `True`, include all meta categories & quantitative indicators
         (or subset if list is given).
-    creds : str, :class:`pathlib.Path`, list-like, or dict, optional
+    creds : str or :class:`pathlib.Path`, optional
         | Credentials (username & password) are not required to access
           any public Scenario Explorer instances (i.e., with Guest login).
         | See :class:`pyam.iiasa.Connection` for details.
@@ -595,3 +597,76 @@ def read_iiasa(name, default=True, meta=True, creds=None, base_url=_AUTH_URL, **
         Arguments for :meth:`pyam.iiasa.Connection.query`
     """
     return Connection(name, creds, base_url).query(default=default, meta=meta, **kwargs)
+
+
+def lazy_read_iiasa(
+    file, name, default=True, meta=True, creds=None, base_url=_AUTH_URL, **kwargs
+):
+    """
+    Try to load data from a local cache, failing that, loads it from the internet.
+
+    Check if the file in a given location is an up-to-date version of an IIASA
+    database. If so, load it. If not, load  data from the IIASA scenario explorer
+    database API and save to that location. Does not check that the previously read
+    version is a complete instance of the database, so if the initial load applies a
+    filter, you will read only data that passes the same filter as well as any
+    additional filter you apply.
+
+    Parameters
+    ----------
+    file : str or :class:`pathlib.Path`
+        The location to test for valid data and save the data if not up-to-date. Must be
+        either xls, xlsx or csv.
+    name : str
+        A valid name of an IIASA scenario explorer instance,
+        see :attr:`pyam.iiasa.Connection.valid_connections`. Obtain a list of options
+        via pyam.iiasa.Connection().valid_connections.
+    default : bool, optional
+        Return *only* the default version of each scenario.
+        Any (`model`, `scenario`) without a default version is omitted.
+        If :obj:`False`, return all versions.
+    meta : bool or list of strings, optional
+        If `True`, include all meta categories & quantitative indicators
+        (or subset if list is given).
+    creds : str or :class:`pathlib.Path`, optional
+        | Credentials (username & password) are not required to access
+          any public Scenario Explorer instances (i.e., with Guest login).
+        | See :class:`pyam.iiasa.Connection` for details.
+        | Use :meth:`pyam.iiasa.set_config` to set credentials
+          for accessing private/restricted Scenario Explorer instances.
+    base_url : str
+        Authentication server URL
+    kwargs
+        Arguments for :meth:`pyam.iiasa.Connection.query`
+    """
+
+    file = Path(file)
+    assert file.suffix in [
+        ".csv",
+        ".xlsx",
+        ".xls",
+    ], "We will only read and write to csv, xls and xlsx format."
+    if os.path.exists(file):
+        date_set = pd.to_datetime(os.path.getmtime(file), unit="s")
+        version_info = Connection(name, creds, base_url).properties()
+        latest_new = np.nanmax(pd.to_datetime(version_info["create_date"]))
+        latest_update = np.nanmax(pd.to_datetime(version_info["update_date"]))
+        latest = pd.Series([latest_new, latest_update]).max()
+        if latest < date_set:
+            old_read = IamDataFrame(file)
+            if kwargs:
+                old_read = old_read.filter(**kwargs)
+            logger.info("Database read from file")
+            return old_read
+        else:
+            logger.info("Database out of date and will be re-downloaded")
+    # If we get here, we need to redownload the database
+    new_read = read_iiasa(
+        name, meta=True, default=default, creds=None, base_url=_AUTH_URL, **kwargs
+    )
+    Path(file).parent.mkdir(parents=True, exist_ok=True)
+    if file.suffix == ".csv":
+        new_read.to_csv(file)
+    else:
+        new_read.to_excel(file)
+    return new_read
diff --git a/tests/test_iiasa.py b/tests/test_iiasa.py
@@ -7,7 +7,7 @@
 import numpy.testing as npt
 import yaml
 
-from pyam import IamDataFrame, iiasa, read_iiasa, META_IDX
+from pyam import IamDataFrame, iiasa, lazy_read_iiasa, read_iiasa, META_IDX
 from pyam.testing import assert_iamframe_equal
 
 from .conftest import META_COLS, IIASA_UNAVAILABLE, TEST_API, TEST_API_NAME
@@ -371,3 +371,31 @@ def test_query_empty_response(conn):
     """Check that querying with an empty response returns an empty IamDataFrame"""
     # solves https://github.com/IAMconsortium/pyam/issues/676
     assert conn.query(model="foo").empty
+
+
+def test_lazy_read(tmpdir):
+    """Check that lazy_read_iiasa reuses a cached file and re-downloads if deleted"""
+    tmp_file = tmpdir / "test_database.csv"
+    df = lazy_read_iiasa(tmp_file, TEST_API, model="model_a")
+    writetime = os.path.getmtime(tmp_file)
+    assert df.model == ["model_a"]
+    # Read from the cached file, which only holds the data filtered by the first call
+    df2 = lazy_read_iiasa(tmp_file, TEST_API)
+    assert df.data.equals(df2.data)
+    # A filter inconsistent with the cached data yields nothing; str paths work too
+    tmp_file = str(tmp_file)
+    df_newfilt = lazy_read_iiasa(tmp_file, TEST_API, model="model_b")
+    assert df_newfilt.empty
+    assert writetime == os.path.getmtime(tmp_file)
+    # Filter correctly applied if the file is deleted
+    os.remove(tmp_file)
+    df_newfilt = lazy_read_iiasa(tmp_file, TEST_API, model="model_b")
+    assert df_newfilt.model == ["model_b"]
+    assert os.path.getmtime(tmp_file) > writetime
+    # file can also be xls or xlsx
+    xlsx_file = tmpdir / "test_database.xlsx"
+    df_xlsx = lazy_read_iiasa(xlsx_file, TEST_API, model="model_b")
+    assert df_newfilt.equals(df_xlsx)
+    xls_file = tmpdir / "test_database.xls"
+    df_xls = lazy_read_iiasa(xls_file, TEST_API, model="model_b")
+    assert df_xls.equals(df_xlsx)