Skip to content

Commit

Permalink
Wrap gmtselect
Browse files Browse the repository at this point in the history
Initial commit for wrapping the gmtselect function for #1427
which selects data table subsets based on multiple spatial
criteria. Original GMT `gmtselect` documentation is at
https://docs.generic-mapping-tools.org/6.2/gmtselect.html.
Aliased non-common optional parameters reverse (I) and
z_subregion (Z).
  • Loading branch information
weiji14 committed Aug 10, 2021
1 parent 85d78d6 commit 253163b
Show file tree
Hide file tree
Showing 5 changed files with 212 additions and 0 deletions.
1 change: 1 addition & 0 deletions doc/api/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ Operations on tabular data:

blockmean
blockmedian
select
surface

Operations on grids:
Expand Down
1 change: 1 addition & 0 deletions pygmt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
grdtrack,
info,
makecpt,
select,
surface,
which,
x2sys_cross,
Expand Down
1 change: 1 addition & 0 deletions pygmt/src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from pygmt.src.plot import plot
from pygmt.src.plot3d import plot3d
from pygmt.src.rose import rose
from pygmt.src.select import select
from pygmt.src.solar import solar
from pygmt.src.subplot import set_panel, subplot
from pygmt.src.surface import surface
Expand Down
144 changes: 144 additions & 0 deletions pygmt/src/select.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""
select - Select data table subsets based on multiple spatial criteria.
"""
import pandas as pd
from pygmt.clib import Session
from pygmt.helpers import (
GMTTempFile,
build_arg_string,
fmt_docstring,
kwargs_to_strings,
use_alias,
)


@fmt_docstring
@use_alias(
    I="reverse",
    J="projection",
    R="region",
    V="verbose",
    Z="z_subregion",
    b="binary",
    d="nodata",
    e="find",
    f="coltypes",
    g="gap",
    h="header",
    i="incols",
    o="outcols",
    r="registration",
    s="skiprows",
    w="wrap",
)
@kwargs_to_strings(R="sequence")
def select(table=None, outfile=None, **kwargs):
    r"""
    Select data table subsets based on multiple spatial criteria.

    This is a filter that reads (x, y) or (longitude, latitude) positions from
    the first 2 columns of *table* and uses a combination of 1-7 criteria to
    pass or reject the records. Records can be selected based on whether or not
    they are:

    1. inside a rectangular region (**region** [and **projection**])
    2. within *dist* km of any point in *pointfile*
    3. within *dist* km of any line in *linefile*
    4. inside one of the polygons in the *polygonfile*
    5. inside geographical features (based on coastlines)
    6. has z-values within a given range, or
    7. inside bins of a grid mask whose nodes are non-zero

    The sense of the tests can be reversed for each of these 7 criteria by
    using the **reverse** option.

    Full option list at :gmt-docs:`gmtselect.html`

    {aliases}

    Parameters
    ----------
    table : str or {table-like}
        Pass in either a file name to an ASCII data table, a 2D
        {table-classes}.
    outfile : str
        The file name for the output ASCII file.
    reverse : str
        [**cfglrsz**].
        Reverses the sense of the test for each of the criteria specified:

        - **c** select records NOT inside any point's circle of influence.
        - **f** select records NOT inside any of the polygons.
        - **g** will pass records inside the cells with z equal zero of the
          grid mask in **-G**.
        - **l** select records NOT within the specified distance of any line.
        - **r** select records NOT inside the specified rectangular region.
        - **s** select records NOT considered inside as specified by **-N**
          (and **-A**, **-D**).
        - **z** select records NOT within the range specified by
          **z_subregion**.
    z_subregion : str
        *min*\ [/*max*]\ [**+a**]\ [**+c**\ *col*]\ [**+i**].
        Pass all records whose 3rd column (*z*; *col* = 2) lies within the
        given range or is NaN (use **skiprows** to skip NaN records). If *max*
        is omitted then we test if *z* equals *min* instead. This means
        equality within 5 ULPs (unit of least precision;
        http://en.wikipedia.org/wiki/Unit_in_the_last_place). Input file must
        have at least three columns. To indicate no limit on min or max,
        specify a hyphen (-). If your 3rd column is absolute time then remember
        to supply ``coltypes="2T"``. To specify another column, append
        **+c**\ *col*, and to specify several tests just repeat the
        **z_subregion** option as many times as you have columns to test.
        **Note**: When more than one **z_subregion** option is given then the
        ``reverse="z"`` option cannot be used. In the case of multiple tests
        you may use these modifiers as well: **+a** passes any record that
        passes at least one of your *z* tests [Default is all tests must pass],
        and **+i** reverses the tests to pass record with *z* value NOT in the
        given range. Finally, if **+c** is not used then it is automatically
        incremented for each new **z_subregion** option, starting with 2.
    {J}
    {R}
    {V}
    {b}
    {d}
    {e}
    {f}
    {g}
    {h}
    {i}
    {o}
    {r}
    {s}
    {w}

    Returns
    -------
    output : pandas.DataFrame or None
        Return type depends on whether the ``outfile`` parameter is set:

        - :class:`pandas.DataFrame` table if ``outfile`` is not set.
        - None if ``outfile`` is set (filtered output will be stored in file
          set by ``outfile``).
    """
    # Decide up front whether the caller wants a DataFrame back, instead of
    # overwriting ``outfile`` and comparing file names afterwards.
    return_dataframe = outfile is None

    with GMTTempFile(suffix=".csv") as tmpfile:
        destination = tmpfile.name if return_dataframe else outfile

        with Session() as lib:
            # Choose how data will be passed into the module
            table_context = lib.virtualfile_from_data(check_kind="vector", data=table)
            with table_context as infile:
                arg_str = " ".join(
                    [infile, build_arg_string(kwargs), "->" + destination]
                )
                lib.call_module(module="gmtselect", args=arg_str)

        if not return_dataframe:
            # Filtered records were written to the user-supplied file.
            return None

        # Reuse the input's column names when it has any (e.g. a
        # pandas.DataFrame). Only the attribute lookup is guarded, so an
        # AttributeError raised while parsing the output is not swallowed.
        try:
            column_names = table.columns.to_list()
        except AttributeError:  # e.g. 'str' object has no attribute 'columns'
            column_names = None

        # Read the temporary tab-separated output while the file still exists.
        # ``comment=">"`` is applied for every input kind so that lines
        # beginning with ">" are never parsed as data records.
        return pd.read_csv(
            tmpfile.name,
            sep="\t",
            header=None,
            names=column_names,
            comment=">",
        )
65 changes: 65 additions & 0 deletions pygmt/tests/test_select.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""
Tests for select.
"""
import os

import numpy.testing as npt
import pandas as pd
import pytest
from pygmt import select
from pygmt.datasets import load_sample_bathymetry
from pygmt.exceptions import GMTInvalidInput
from pygmt.helpers import GMTTempFile, data_kind


@pytest.fixture(scope="module", name="dataframe")
def fixture_dataframe():
    """
    Provide the sample bathymetry table used as input by the select tests.

    The fixture is module-scoped, so the dataset is loaded once and shared by
    every test in this module.
    """
    bathymetry = load_sample_bathymetry()
    return bathymetry


def test_select_input_dataframe(dataframe):
    """
    Check that select accepts a pandas.DataFrame and returns a DataFrame with
    the same column names.
    """
    bounds = [250, 251, 26, 27]
    expected_median = [250.31464, 26.33893, -270.0]

    selected = select(table=dataframe, region=bounds)

    assert isinstance(selected, pd.DataFrame)
    assert all(dataframe.columns == selected.columns)
    assert selected.shape == (65, 3)
    npt.assert_allclose(selected.median(), expected_median)


def test_select_input_table_matrix(dataframe):
    """
    Check that select accepts matrix input taken from the ``values`` of a
    pandas.DataFrame.

    The reverse (I) alias is exercised at the same time.
    """
    matrix = dataframe.values
    bounds = [245.5, 254.5, 20.5, 29.5]
    expected_median = [247.235, 20.48624, -3241.0]

    selected = select(table=matrix, region=bounds, reverse="r")

    assert isinstance(selected, pd.DataFrame)
    assert selected.shape == (9177, 3)
    npt.assert_allclose(selected.median(), expected_median)


def test_select_input_filename():
    """
    Check that select accepts an ASCII text file name and writes its result to
    ``outfile`` instead of returning a table.

    The z_subregion (Z) alias is exercised at the same time.
    """
    with GMTTempFile() as tmpfile:
        options = {
            "table": "@tut_ship.xyz",
            "region": [250, 251, 26, 27],
            "z_subregion": ["-/-630", "-120/0+a"],
            "outfile": tmpfile.name,
        }
        returned = select(**options)

        # Nothing is returned when outfile is set; the data lands in the file
        assert returned is None
        assert os.path.exists(path=tmpfile.name)

        written = pd.read_csv(tmpfile.name, sep="\t", header=None)
        assert written.shape == (5, 3)
        npt.assert_allclose(written.median(), [250.12149, 26.04296, -674.0])

0 comments on commit 253163b

Please sign in to comment.