sgkit-plink IO merger #277

Merged · 3 commits · Sep 28, 2020
4 changes: 3 additions & 1 deletion .github/workflows/windows.yml
@@ -21,8 +21,10 @@ jobs:
- name: Install dependencies
# activate conda
shell: bash -l {0}
# conda can't install all dev tools, so we need to split it between conda and pip
run: |
conda install --file requirements.txt --file requirements-dev.txt msprime
conda install --file requirements.txt msprime
pip install -r requirements-dev.txt
- name: Test with pytest and coverage
# activate conda
shell: bash -l {0}
20 changes: 17 additions & 3 deletions docs/api.rst
@@ -1,11 +1,26 @@
.. currentmodule:: sgkit

#############
API reference
#############

This page provides an auto-generated summary of sgkit's API.

IO/imports
==========

.. currentmodule:: sgkit.io.plink
.. autosummary::
:toctree: generated/

read_plink

.. currentmodule:: sgkit
.. autosummary::
:toctree: generated/

read_vcfzarr

.. currentmodule:: sgkit

Creating a dataset
==================

@@ -14,7 +29,6 @@ Creating a dataset

create_genotype_call_dataset
create_genotype_dosage_dataset
read_vcfzarr

Methods
=======
11 changes: 6 additions & 5 deletions docs/index.rst
@@ -2,12 +2,13 @@ sgkit: Statistical genetics toolkit in Python
=============================================

.. toctree::
:maxdepth: 2
:caption: Contents:
:maxdepth: 2
:caption: Contents:

api
usage
contributing
api
usage
io
contributing


Indices and tables
16 changes: 16 additions & 0 deletions docs/io.rst
@@ -0,0 +1,16 @@
.. _io:

IO
==

PLINK
-----

The :func:`sgkit.io.plink.read_plink` function loads a single PLINK dataset
from bed, bim, and fam files as Dask arrays within an `xr.Dataset`.

PLINK IO support is an "extra" feature within sgkit and requires additional
dependencies. To install sgkit with PLINK support using pip::

$ pip install git+https://github.com/pystatgen/sgkit#egg=sgkit[plink]

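For orientation, a minimal usage sketch of the new reader follows. The file prefix data/example is hypothetical, and the path keyword is assumed to point at a prefix shared by the .bed/.bim/.fam files:

    import sgkit.io.plink as plink

    # Hypothetical prefix: data/example.bed, data/example.bim and
    # data/example.fam are assumed to exist on disk.
    ds = plink.read_plink(path="data/example")

    # The result is an xarray.Dataset whose variables are Dask arrays.
    print(ds)
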
3 changes: 3 additions & 0 deletions requirements-dev.txt
@@ -8,3 +8,6 @@ statsmodels
zarr
msprime
scikit-learn
partd
fsspec
bed-reader
23 changes: 21 additions & 2 deletions setup.cfg
@@ -36,7 +36,24 @@ install_requires =
setup_requires =
setuptools >= 41.2
setuptools_scm


[options.extras_require]
# For plink we need dask[dataframe]. We already have
# dask[array] in install_requires, and because of
# https://github.com/pypa/pip/issues/4957 pip will
# essentially ignore dask[dataframe] when it appears in
# an extra. We could work around this by adding the pip
# flag --use-feature 2020-resolver, by moving
# dask[dataframe] into install_requires, or by listing
# the 2 missing dependencies from dataframe directly, as
# we do here. Once pip ships its new resolver this won't
# be a problem; listing the 2 dependencies is the least
# invasive option for users.
plink =
partd
fsspec
bed-reader
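As a side note, the effect of this extras declaration can be inspected at runtime. A small sketch, assuming Python 3.8+ and an installed sgkit distribution built from this setup.cfg:

    from importlib.metadata import metadata, requires

    # Declared extras, e.g. ['plink', ...]
    print(metadata("sgkit").get_all("Provides-Extra"))

    # Requirements gated on the "plink" extra (partd, fsspec, bed-reader).
    print([r for r in (requires("sgkit") or []) if "plink" in r])
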
Collaborator:
I can't see where dask[dataframe] actually gets installed - it looks like only its dependencies get installed here?

Collaborator:
It makes sense, but I'm still missing something: where is the dask[dataframe] dependency declared? It's used by pysnptools.py, but how does it get installed?

Collaborator Author (@ravwojdyla), Sep 24, 2020:
dask[dataframe] is not an "actual" dependency; it's an extra of dask. We already install dask via dask[array] in install_requires, and because it would be an extra inside our own extra (which this pip bug affects), we can't really list dask[dataframe] as a dependency unless we use the 2020 pip resolver. That leaves the options outlined in the comment above, and rather than forcing the resolver flag on users I opted to list the missing dependencies that come from dask[dataframe]. The initial version of this PR actually used the 2020 resolver, but I reverted to listing the dask[dataframe] deps directly so that we can use plain pip. (A short illustration follows this thread.)

Collaborator:
Ah, got it. I was missing the bit about dask[dataframe] not actually providing any Dask code.

When the pip bug is fixed do you think we should switch to just add the dask[dataframe] dependency here?

Collaborator Author:
@tomwhite quick answer: sure. But that bug will be fixed by the 2020 resolver. Right now the 2020 resolver is available behind a feature flag and is scheduled to become the default in October this year, so we can wait until then plus a couple of months and then switch over.

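To make the point in this thread concrete: dask[dataframe] is only a dependency shortcut, since the dask.dataframe module itself ships with the base dask package, so listing partd and fsspec in the extra is enough for it to work. A rough sketch, assuming pandas is already available through the core requirements:

    import dask.dataframe as dd
    import pandas as pd

    # dask.dataframe imports from the base dask install; there is no
    # separate "dask[dataframe]" package, only its extra dependencies.
    pdf = pd.DataFrame({"x": [1, 2, 3]})
    ddf = dd.from_pandas(pdf, npartitions=1)
    print(ddf.x.sum().compute())  # 6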

[coverage:report]
fail_under = 100

@@ -92,9 +109,11 @@ ignore_missing_imports = True
ignore_missing_imports = True
[mypy-sklearn.*]
ignore_missing_imports = True
[mypy-bed_reader.*]
ignore_missing_imports = True
[mypy-sgkit.*]
allow_redefinition = True
[mypy-sgkit.tests.*]
[mypy-sgkit.*.tests.*]
disallow_untyped_defs = False
disallow_untyped_decorators = False
[mypy-validation.*]
11 changes: 11 additions & 0 deletions sgkit/io/plink/__init__.py
@@ -0,0 +1,11 @@
try:
    from .plink_reader import read_plink  # noqa: F401

    __all__ = ["read_plink"]
except ImportError as e:
    msg = (
        "sgkit-plink requirements are not installed.\n\n"
        "Please install them via pip:\n\n"
        "  pip install 'git+https://github.com/pystatgen/sgkit#egg=sgkit[plink]'"
    )
    raise ImportError(str(e) + "\n\n" + msg) from e
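For reference, the guard above means that importing the subpackage without the plink extra installed fails with one actionable message rather than an opaque missing-module error. A small sketch of how this surfaces for a user (the missing-dependency scenario is assumed):

    try:
        from sgkit.io.plink import read_plink
    except ImportError as err:
        # Without the "plink" extra, the message ends with the pip command
        # suggested in sgkit/io/plink/__init__.py above.
        print(err)
    else:
        print(read_plink)  # with the extra installed, the reader is importable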