Wrap gmtselect

Initial commit for wrapping the gmtselect function (#1427), which selects data table subsets based on multiple spatial criteria. The original GMT `gmtselect` documentation is at https://docs.generic-mapping-tools.org/6.2/gmtselect.html. This commit aliases the non-common optional parameters reverse (I) and z_subregion (Z).
GenericMappingTools · Aug 10, 2021 · 253163b · 253163b
1 parent 85d78d6
commit 253163b
Show file tree

Hide file tree

Showing 5 changed files with 212 additions and 0 deletions.
diff --git a/doc/api/index.rst b/doc/api/index.rst
@@ -81,6 +81,7 @@ Operations on tabular data:
 
     blockmean
     blockmedian
+    select
     surface
 
 Operations on grids:

diff --git a/pygmt/__init__.py b/pygmt/__init__.py
@@ -44,6 +44,7 @@
     grdtrack,
     info,
     makecpt,
+    select,
     surface,
     which,
     x2sys_cross,

diff --git a/pygmt/src/__init__.py b/pygmt/src/__init__.py
@@ -33,6 +33,7 @@
 from pygmt.src.plot import plot
 from pygmt.src.plot3d import plot3d
 from pygmt.src.rose import rose
+from pygmt.src.select import select
 from pygmt.src.solar import solar
 from pygmt.src.subplot import set_panel, subplot
 from pygmt.src.surface import surface

diff --git a/pygmt/src/select.py b/pygmt/src/select.py
@@ -0,0 +1,144 @@
+"""
+select - Select data table subsets based on multiple spatial criteria.
+"""
+import pandas as pd
+from pygmt.clib import Session
+from pygmt.helpers import (
+    GMTTempFile,
+    build_arg_string,
+    fmt_docstring,
+    kwargs_to_strings,
+    use_alias,
+)
+
+
+@fmt_docstring
+@use_alias(
+    I="reverse",
+    J="projection",
+    R="region",
+    V="verbose",
+    Z="z_subregion",
+    b="binary",
+    d="nodata",
+    e="find",
+    f="coltypes",
+    g="gap",
+    h="header",
+    i="incols",
+    o="outcols",
+    r="registration",
+    s="skiprows",
+    w="wrap",
+)
+@kwargs_to_strings(R="sequence")
+def select(table=None, outfile=None, **kwargs):
+    r"""
+    Select data table subsets based on multiple spatial criteria.
+
+    This is a filter that reads (x, y) or (longitude, latitude) positions from
+    the first 2 columns of *table* and uses a combination of 1-7 criteria to
+    pass or reject the records. Records can be selected based on whether or not
+    they are:
+
+    1. inside a rectangular region (**region** [and **projection**])
+    2. within *dist* km of any point in *pointfile*
+    3. within *dist* km of any line in *linefile*
+    4. inside one of the polygons in the *polygonfile*
+    5. inside geographical features (based on coastlines)
+    6. has z-values within a given range, or
+    7. inside bins of a grid mask whose nodes are non-zero
+
+    The sense of the tests can be reversed for each of these 7 criteria by
+    using the **reverse** option.
+
+    Full option list at :gmt-docs:`gmtselect.html`
+
+    {aliases}
+
+    Parameters
+    ----------
+    table : str or {table-like}
+        Pass in either a file name to an ASCII data table, a 2D
+        {table-classes}.
+    outfile : str
+        The file name for the output ASCII file.
+    reverse : str
+        [**cfglrsz**].
+        Reverses the sense of the test for each of the criteria specified:
+
+        - **c** select records NOT inside any point's circle of influence.
+        - **f** select records NOT inside any of the polygons.
+        - **g** will pass records inside the cells with z equal zero of the
+          grid mask in **-G**.
+        - **l** select records NOT within the specified distance of any line.
+        - **r** select records NOT inside the specified rectangular region.
+        - **s** select records NOT considered inside as specified by **-N**
+          (and **-A**, **-D**).
+        - **z** select records NOT within the range specified by
+          **z_subregion**.
+    z_subregion : str or list
+        *min*\ [/*max*]\ [**+a**]\ [**+c**\ *col*]\ [**+i**].
+        Pass all records whose 3rd column (*z*; *col* = 2) lies within the
+        given range or is NaN (use **skiprows** to skip NaN records). If *max*
+        is omitted then we test if *z* equals *min* instead. This means
+        equality within 5 ULPs (unit of least precision;
+        https://en.wikipedia.org/wiki/Unit_in_the_last_place). Input file must
+        have at least three columns. To indicate no limit on min or max,
+        specify a hyphen (-). If your 3rd column is absolute time then remember
+        to supply ``coltypes="2T"``. To specify another column, append
+        **+c**\ *col*, and to specify several tests pass a list with one
+        **z_subregion** string per column to test.
+        **Note**: When more than one **z_subregion** test is given then the
+        ``reverse="z"`` option cannot be used. In the case of multiple tests
+        you may use these modifiers as well: **+a** passes any record that
+        passes at least one of your *z* tests [Default is all tests must pass],
+        and **+i** reverses the tests to pass record with *z* value NOT in the
+        given range. Finally, if **+c** is not used then it is automatically
+        incremented for each new **z_subregion** test, starting with 2.
+    {J}
+    {R}
+    {V}
+    {b}
+    {d}
+    {e}
+    {f}
+    {g}
+    {h}
+    {i}
+    {o}
+    {r}
+    {s}
+    {w}
+
+    Returns
+    -------
+    output : pandas.DataFrame or None
+        Return type depends on whether the ``outfile`` parameter is set:
+
+        - :class:`pandas.DataFrame` table if ``outfile`` is not set. Its
+          column names are copied from ``table`` when ``table`` exposes a
+          ``columns`` attribute; otherwise the columns are numbered.
+        - None if ``outfile`` is set (filtered output will be stored in file
+          set by ``outfile``).
+    """
+
+    with GMTTempFile(suffix=".csv") as tmpfile:
+        # The module output is always redirected to a file; use the temporary
+        # file when the caller did not name one. ``outfile`` itself is left
+        # untouched so it can still tell the two cases apart afterwards.
+        destination = tmpfile.name if outfile is None else outfile
+
+        with Session() as lib:
+            # Choose how data will be passed into the module
+            table_context = lib.virtualfile_from_data(check_kind="vector", data=table)
+            with table_context as infile:
+                arg_str = " ".join(
+                    [infile, build_arg_string(kwargs), "->" + destination]
+                )
+                lib.call_module(module="gmtselect", args=arg_str)
+
+        if outfile is None:
+            # Only the column-name lookup is guarded, so an AttributeError
+            # raised while parsing the output is no longer swallowed.
+            try:
+                column_names = table.columns.to_list()
+            except AttributeError:  # e.g. 'str' object has no attribute 'columns'
+                column_names = None
+            # Skip ">" segment-header records whatever the input kind was.
+            result = pd.read_csv(
+                tmpfile.name,
+                sep="\t",
+                header=None,
+                names=column_names,
+                comment=">",
+            )
+        else:
+            # Output already lives in the user-supplied file.
+            result = None
+
+    return result
diff --git a/pygmt/tests/test_select.py b/pygmt/tests/test_select.py
@@ -0,0 +1,65 @@
+"""
+Tests for select.
+"""
+import os
+
+import numpy.testing as npt
+import pandas as pd
+import pytest
+from pygmt import select
+from pygmt.datasets import load_sample_bathymetry
+from pygmt.exceptions import GMTInvalidInput
+from pygmt.helpers import GMTTempFile, data_kind
+
+
+@pytest.fixture(scope="module", name="dataframe")
+def fixture_dataframe():
+    """
+    Provide the sample bathymetry table returned by
+    ``load_sample_bathymetry``, shared by every test in this module.
+    """
+    bathymetry = load_sample_bathymetry()
+    return bathymetry
+
+
+def test_select_input_dataframe(dataframe):
+    """
+    Check that a pandas.DataFrame input yields a filtered pandas.DataFrame
+    carrying the same column names.
+    """
+    bounds = [250, 251, 26, 27]
+    selected = select(table=dataframe, region=bounds)
+
+    assert isinstance(selected, pd.DataFrame)
+    assert (dataframe.columns == selected.columns).all()
+    assert selected.shape == (65, 3)
+
+    expected_median = [250.31464, 26.33893, -270.0]
+    npt.assert_allclose(selected.median(), expected_median)
+
+
+def test_select_input_table_matrix(dataframe):
+    """
+    Check select with the values of the DataFrame passed as a plain matrix
+    rather than as a pandas.DataFrame.
+
+    The reverse (I) alias is exercised at the same time.
+    """
+    matrix = dataframe.values
+    bounds = [245.5, 254.5, 20.5, 29.5]
+    selected = select(table=matrix, region=bounds, reverse="r")
+
+    assert isinstance(selected, pd.DataFrame)
+    assert selected.shape == (9177, 3)
+
+    expected_median = [247.235, 20.48624, -3241.0]
+    npt.assert_allclose(selected.median(), expected_median)
+
+
+def test_select_input_filename():
+    """
+    Check select with an ASCII text file as input and an explicit output
+    file.
+
+    The z_subregion (Z) alias is exercised at the same time.
+    """
+    with GMTTempFile() as tmpfile:
+        destination = tmpfile.name
+        returned = select(
+            table="@tut_ship.xyz",
+            region=[250, 251, 26, 27],
+            z_subregion=["-/-630", "-120/0+a"],
+            outfile=destination,
+        )
+        # Nothing is returned when the result is written to outfile
+        assert returned is None
+        assert os.path.exists(path=destination)
+
+        written = pd.read_csv(destination, sep="\t", header=None)
+        assert written.shape == (5, 3)
+        npt.assert_allclose(written.median(), [250.12149, 26.04296, -674.0])