fix: isolate required and optional dependencies (#237)

Allow more minimal installs by making certain packages optional, namely the backend engines (`h5netcdf`, `netcdf4`, and `zarr`) used for saving and loading models, plus `statsmodels`, which is used for CPCCA-derived models, and `numba`, which is used for GWPCA.
xarray-contrib · Oct 7, 2024 · a6c05e6 · a6c05e6
1 parent a6a32a0
commit a6c05e6
Show file tree

Hide file tree

Showing 31 changed files with 177 additions and 67 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -11,7 +11,7 @@ on:
 
 jobs:
   test:
-    name: py${{ matrix.versions.python-version }} ${{ matrix.versions.resolution }}
+    name: py${{ matrix.versions.python-version }} ${{ matrix.versions.resolution }} ${{ matrix.deps.name}}
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -22,6 +22,14 @@ jobs:
             resolution: highest
           - python-version: '3.12'
             resolution: highest
+        deps:
+          - name: minimal
+            value: '[dev]'
+            doctest: '' # doctest runs MCA and requires statsmodels
+          - name: complete
+            value: '[dev,complete]'
+            doctest: '--doctest-glob=README.md'
+
     steps:
       - uses: actions/checkout@v4
 
@@ -33,11 +41,12 @@ jobs:
       - name: Install dependencies
         run: |
           pip install uv
-          uv pip install . -r pyproject.toml --system --extra dev --resolution ${{ matrix.versions.resolution }}
+          uv pip install .${{ matrix.deps.value }} -r pyproject.toml \
+             --system --resolution ${{ matrix.versions.resolution }}
 
       - name: Execute Tests
         run: |
-          coverage run -m pytest -n auto --doctest-glob="README.md"
+          coverage run -m pytest -n auto ${{ matrix.deps.doctest }}
           coverage report -m
           coverage xml
 

diff --git a/docs/content/contributing.rst b/docs/content/contributing.rst
@@ -48,19 +48,9 @@ Using the commands below, prepare your environment:
 
     conda create -n xeofs python=3.11 rpy2 pandoc
     conda activate xeofs
-    pip install -e .[docs,dev]
+    pip install -e .[complete,docs,dev]
 
-This will install all necessary dependencies, including those for development and documentation. If you're only updating the code (without modifying online documentation), you can skip the docs dependency:
-
-.. code-block:: bash
-
-    pip install -e .[dev]
-
-On the other hand, if you're just updating documentation:
-
-.. code-block:: bash
-
-    pip install -e .[docs]
+This will install both core and optional dependencies, including those for specialized models, documentation, and development. Alternatively, you can skip some of the optional dependency sets (``[complete,docs,dev]``) depending on which components of the package you're working on.
 
 Additionally, install the pre-commit hooks:
 
@@ -81,7 +71,7 @@ Before diving into your contribution, ensure your local main branch is updated:
     git fetch upstream
     git merge upstream/main
 
-This syncs your local main branch with the latest from the primary `xeofs` repository.
+This syncs your local main branch with the latest from the primary ``xeofs`` repository.
 
 4. Create a new branch
 ----------------------

diff --git a/docs/content/user_guide/installation.rst b/docs/content/user_guide/installation.rst
@@ -1,35 +1,33 @@
 Installation
 ------------
 
-Required Dependencies
+Dependencies
 ~~~~~~~~~~~~~~~~~~~~~
 
-The following packages are required dependencies:
+The following packages are dependencies of ``xeofs``:
 
-**Core Dependencies**
+**Core Dependencies (Required)**
 
 * Python (3.10 or higher)
-* `numpy <https://www.numpy.org/>`__ 
-* `pandas <https://pandas.pydata.org/>`__ 
-* `xarray <http://xarray.pydata.org/>`__ 
-* `scikit-learn <https://scikit-learn.org/stable/>`__ 
-* `statsmodels <https://www.statsmodels.org/stable/index.html>`__ 
+* `numpy <https://www.numpy.org/>`__
+* `pandas <https://pandas.pydata.org/>`__
+* `xarray <http://xarray.pydata.org/>`__
+* `dask <https://dask.org/>`__
+* `scikit-learn <https://scikit-learn.org/stable/>`__
+* `typing-extensions <https://pypi.org/project/typing-extensions/>`__
+* `tqdm <https://tqdm.github.io/>`__
 
-**For Performance**
+**For Specialized Models (Optional)**
 
-* `dask <https://dask.org/>`__ 
-* `numba <https://numba.pydata.org/>`__ 
+* `numba <https://numba.pydata.org/>`__
+* `statsmodels <https://www.statsmodels.org/stable/index.html>`__
 
-**For I/O**
+**For I/O (Optional)**
 
-* `netCDF4 <https://unidata.github.io/netcdf4-python/netCDF4/index.html>`__ 
-* `zarr <https://zarr.readthedocs.io/en/stable/>`__ 
-* `xarray-datatree <https://github.com/xarray-contrib/datatree>`__
+* `h5netcdf <https://h5netcdf.org/>`__
+* `netCDF4 <https://unidata.github.io/netcdf4-python/netCDF4/index.html>`__
+* `zarr <https://zarr.readthedocs.io/en/stable/>`__
 
-**Miscellaneous**
-
-* `typing-extensions <https://pypi.org/project/typing-extensions/>`__ 
-* `tqdm <https://tqdm.github.io/>`__ 
 
 Instructions
 ~~~~~~~~~~~~
@@ -46,3 +44,17 @@ or the Python package installer `pip <https://pip.pypa.io/en/stable/getting-star
 .. code-block:: bash
 
     pip install xeofs
+
+Several optional dependencies are required for certain functionality and are not installed by default:
+
+* ``zarr``, ``h5netcdf``, or ``netcdf4`` are necessary for saving and loading models to disk
+* ``statsmodels`` is required for all models that inherit from ``CPCCA`` including ``CCA``, ``MCA`` and ``RDA``
+* ``numba`` is required for the ``GWPCA`` model
+
+These extras can be automatically included when installing with pip:
+
+.. code-block:: bash
+
+    pip install xeofs[complete]
+    # or using individual groups
+    pip install xeofs[io,etc]
diff --git a/docs/environment.yml b/docs/environment.yml
@@ -8,4 +8,4 @@ dependencies:
   - pandoc
   - pip
   - pip:
-    - -e ../.[docs]
+    - -e ../.[complete,docs]
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,15 +17,12 @@ dependencies = [
   "scikit-learn>=1.0.2",
   "tqdm>=4.64.0",
   "dask>=2023.0.1",
-  "statsmodels>=0.14.0",
-  "netCDF4>=1.5.8",
-  "numba>=0.57",
   "typing-extensions>=4.8.0",
-  "zarr>=2.14.0",
   "xarray-datatree>=0.0.12",
 ]
 
 [project.optional-dependencies]
+complete = ["xeofs[etc,io]"]
 dev = [
   "build>=1.0.0",
   "ruff>=0.3",
@@ -53,6 +50,15 @@ docs = [
   "ipython>=8.14",
   "ipykernel>=6.23",
 ]
+etc = [
+  "numba>=0.57",
+  "statsmodels>=0.14.0",
+]
+io = [
+  "h5netcdf>=1.0.0",
+  "netcdf4>=1.5.8",
+  "zarr>=2.14.0",
+]
 
 [project.urls]
 homepage = "https://github.com/xarray-contrib/xeofs"

diff --git a/tests/models/cross/__init__.py b/tests/models/cross/__init__.py
@@ -0,0 +1,3 @@
+import pytest
+
+pytest.importorskip("statsmodels")
diff --git a/tests/models/cross/test_cca.py b/tests/models/cross/test_cca.py
@@ -5,6 +5,8 @@
 
 from xeofs.cross import CCA
 
+from ...utilities import skip_if_missing_engine
+
 
 def generate_random_data(shape, lazy=False, seed=142):
     rng = np.random.default_rng(seed)
@@ -226,11 +228,13 @@ def test_predict():
     _ = cca.inverse_transform(Y=Ry_pred)
 
 
-@pytest.mark.parametrize("engine", ["netcdf4", "zarr"])
+@pytest.mark.parametrize("engine", ["h5netcdf", "netcdf4", "zarr"])
 def test_save_load(tmp_path, engine):
     """Test save/load methods in MCA class, ensuring that we can
     roundtrip the model and get the same results when transforming
     data."""
+    skip_if_missing_engine(engine)
+
     X = generate_random_data((200, 10), seed=123)
     Y = generate_random_data((200, 20), seed=321)
 

diff --git a/tests/models/cross/test_cpcca.py b/tests/models/cross/test_cpcca.py
@@ -5,6 +5,8 @@
 
 from xeofs.cross import CPCCA
 
+from ...utilities import skip_if_missing_engine
+
 
 def generate_random_data(shape, lazy=False, seed=142):
     rng = np.random.default_rng(seed)
@@ -274,12 +276,14 @@ def test_predict():
     _ = cpcca.inverse_transform(Y=Ry_pred)
 
 
-@pytest.mark.parametrize("engine", ["netcdf4", "zarr"])
+@pytest.mark.parametrize("engine", ["h5netcdf", "netcdf4", "zarr"])
 @pytest.mark.parametrize("alpha", [0.0, 0.5, 1.0])
 def test_save_load(tmp_path, engine, alpha):
     """Test save/load methods in MCA class, ensuring that we can
     roundtrip the model and get the same results when transforming
     data."""
+    skip_if_missing_engine(engine)
+
     X = generate_random_data((200, 10), seed=123)
     Y = generate_random_data((200, 20), seed=321)
 
@@ -319,11 +323,13 @@ def test_save_load(tmp_path, engine, alpha):
     assert np.allclose(XYr_o[1], XYr_l[1])
 
 
-@pytest.mark.parametrize("engine", ["netcdf4", "zarr"])
+@pytest.mark.parametrize("engine", ["h5netcdf", "netcdf4", "zarr"])
 @pytest.mark.parametrize("alpha", [0.0, 0.5, 1.0])
 def test_save_load_with_data(tmp_path, engine, alpha):
     """Test save/load methods in CPCCA class, ensuring that we can
     roundtrip the model and get the same results for SCF."""
+    skip_if_missing_engine(engine)
+
     X = generate_random_data((200, 10), seed=123)
     Y = generate_random_data((200, 20), seed=321)
 

diff --git a/tests/models/cross/test_hilbert_cpcca.py b/tests/models/cross/test_hilbert_cpcca.py
@@ -5,6 +5,8 @@
 
 from xeofs.cross import HilbertCPCCA
 
+from ...utilities import skip_if_missing_engine
+
 
 def generate_random_data(shape, lazy=False, seed=142):
     rng = np.random.default_rng(seed)
@@ -65,11 +67,13 @@ def test_singular_values(use_pca):
 
 
 # Currently, netCDF4 does not support complex numbers, so skip this test
-@pytest.mark.parametrize("engine", ["zarr"])
+@pytest.mark.parametrize("engine", ["h5netcdf", "zarr"])
 @pytest.mark.parametrize("alpha", [0.0, 0.5, 1.0])
 def test_save_load_with_data(tmp_path, engine, alpha):
     """Test save/load methods in CPCCA class, ensuring that we can
     roundtrip the model and get the same results."""
+    skip_if_missing_engine(engine)
+
     X = generate_random_data((200, 10), seed=123)
     Y = generate_random_data((200, 20), seed=321)
 

diff --git a/tests/models/cross/test_hilbert_mca_rotator.py b/tests/models/cross/test_hilbert_mca_rotator.py
@@ -5,6 +5,8 @@
 # Import the classes from your modules
 from xeofs.cross import HilbertMCA, HilbertMCARotator
 
+from ...utilities import skip_if_missing_engine
+
 
 @pytest.fixture
 def mca_model(mock_data_array, dim):
@@ -242,10 +244,12 @@ def test_scores_phase(mca_model, mock_data_array, dim):
     ],
 )
 # Currently, netCDF4 does not support complex numbers, so skip this test
-@pytest.mark.parametrize("engine", ["zarr"])
+@pytest.mark.parametrize("engine", ["h5netcdf", "zarr"])
 def test_save_load_with_data(tmp_path, engine, mca_model):
     """Test save/load methods in HilbertMCARotator class, ensuring that we can
     roundtrip the model and get the same results."""
+    skip_if_missing_engine(engine)
+
     original = HilbertMCARotator(n_modes=2)
     original.fit(mca_model)
 

diff --git a/tests/models/cross/test_mca.py b/tests/models/cross/test_mca.py
@@ -4,7 +4,7 @@
 
 from xeofs.cross import MCA
 
-from ...utilities import data_is_dask
+from ...utilities import data_is_dask, skip_if_missing_engine
 
 
 @pytest.fixture
@@ -376,11 +376,13 @@ def test_compute(mock_dask_data_array, dim, compute):
         (("lon", "lat")),
     ],
 )
-@pytest.mark.parametrize("engine", ["netcdf4", "zarr"])
+@pytest.mark.parametrize("engine", ["h5netcdf", "netcdf4", "zarr"])
 def test_save_load(dim, mock_data_array, tmp_path, engine):
     """Test save/load methods in MCA class, ensuring that we can
     roundtrip the model and get the same results when transforming
     data."""
+    skip_if_missing_engine(engine)
+
     original = MCA()
     original.fit(mock_data_array, mock_data_array, dim)
 

diff --git a/tests/models/cross/test_mca_rotator.py b/tests/models/cross/test_mca_rotator.py
@@ -5,7 +5,7 @@
 # Import the classes from your modules
 from xeofs.cross import MCA, MCARotator
 
-from ...utilities import data_is_dask
+from ...utilities import data_is_dask, skip_if_missing_engine
 
 
 @pytest.fixture
@@ -230,11 +230,13 @@ def test_compute(mca_model_delayed, compute):
         (("lon", "lat")),
     ],
 )
-@pytest.mark.parametrize("engine", ["netcdf4", "zarr"])
+@pytest.mark.parametrize("engine", ["h5netcdf", "netcdf4", "zarr"])
 def test_save_load(dim, mock_data_array, tmp_path, engine):
     """Test save/load methods in MCA class, ensuring that we can
     roundtrip the model and get the same results when transforming
     data."""
+    skip_if_missing_engine(engine)
+
     original_unrotated = MCA()
     original_unrotated.fit(mock_data_array, mock_data_array, dim)
 

diff --git a/tests/models/cross/test_rda.py b/tests/models/cross/test_rda.py
@@ -5,6 +5,8 @@
 
 from xeofs.cross import RDA
 
+from ...utilities import skip_if_missing_engine
+
 
 def generate_random_data(shape, lazy=False, seed=142):
     rng = np.random.default_rng(seed)
@@ -226,11 +228,13 @@ def test_predict():
     _ = rda.inverse_transform(Y=Ry_pred)
 
 
-@pytest.mark.parametrize("engine", ["netcdf4", "zarr"])
+@pytest.mark.parametrize("engine", ["h5netcdf", "netcdf4", "zarr"])
 def test_save_load(tmp_path, engine):
     """Test save/load methods in MCA class, ensuring that we can
     roundtrip the model and get the same results when transforming
     data."""
+    skip_if_missing_engine(engine)
+
     X = generate_random_data((200, 10), seed=123)
     Y = generate_random_data((200, 20), seed=321)
 

diff --git a/tests/models/single/test_eof.py b/tests/models/single/test_eof.py
@@ -4,6 +4,8 @@
 
 from xeofs.single import EOF
 
+from ...utilities import skip_if_missing_engine
+
 
 def test_init():
     """Tests the initialization of the EOF class"""
@@ -494,11 +496,13 @@ def test_inverse_transform(dim, mock_data_array, normalized):
         (("lon", "lat")),
     ],
 )
-@pytest.mark.parametrize("engine", ["netcdf4", "zarr"])
+@pytest.mark.parametrize("engine", ["h5netcdf", "netcdf4", "zarr"])
 def test_save_load(dim, mock_data_array, tmp_path, engine):
     """Test save/load methods in EOF class, ensuring that we can
     roundtrip the model and get the same results when transforming
     data."""
+    skip_if_missing_engine(engine)
+
     original = EOF()
     original.fit(mock_data_array, dim)