From 53ef11943549b4295672a1870b21a7c63c3e672c Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Tue, 10 Aug 2021 23:29:05 +1200 Subject: [PATCH 1/7] Wrap gmtselect Initial commit for wrapping the gmtselect function for #1427 which selects data table subsets based on multiple spatial criteria. Original GMT `gmtselect` documentation is at https://docs.generic-mapping-tools.org/6.2/gmtselect.html. Aliased non-common optional parameters reverse (I) and z_subregion (Z). --- doc/api/index.rst | 1 + pygmt/__init__.py | 1 + pygmt/src/__init__.py | 1 + pygmt/src/select.py | 144 +++++++++++++++++++++++++++++++++++++ pygmt/tests/test_select.py | 64 +++++++++++++++++ 5 files changed, 211 insertions(+) create mode 100644 pygmt/src/select.py create mode 100644 pygmt/tests/test_select.py diff --git a/doc/api/index.rst b/doc/api/index.rst index 3536e83f640..89ba7e50594 100644 --- a/doc/api/index.rst +++ b/doc/api/index.rst @@ -81,6 +81,7 @@ Operations on tabular data: blockmean blockmedian + select surface Operations on grids: diff --git a/pygmt/__init__.py b/pygmt/__init__.py index 25ad96a2127..7bff159d4d2 100644 --- a/pygmt/__init__.py +++ b/pygmt/__init__.py @@ -44,6 +44,7 @@ grdtrack, info, makecpt, + select, surface, which, x2sys_cross, diff --git a/pygmt/src/__init__.py b/pygmt/src/__init__.py index 2e1f8b56186..ae7ca41af2e 100644 --- a/pygmt/src/__init__.py +++ b/pygmt/src/__init__.py @@ -33,6 +33,7 @@ from pygmt.src.plot import plot from pygmt.src.plot3d import plot3d from pygmt.src.rose import rose +from pygmt.src.select import select from pygmt.src.solar import solar from pygmt.src.subplot import set_panel, subplot from pygmt.src.surface import surface diff --git a/pygmt/src/select.py b/pygmt/src/select.py new file mode 100644 index 00000000000..e4ccf8a1a67 --- /dev/null +++ b/pygmt/src/select.py @@ -0,0 +1,144 @@ +""" +select - Select data table subsets based on multiple spatial criteria. 
+""" +import pandas as pd +from pygmt.clib import Session +from pygmt.helpers import ( + GMTTempFile, + build_arg_string, + fmt_docstring, + kwargs_to_strings, + use_alias, +) + + +@fmt_docstring +@use_alias( + I="reverse", + J="projection", + R="region", + V="verbose", + Z="z_subregion", + b="binary", + d="nodata", + e="find", + f="coltypes", + g="gap", + h="header", + i="incols", + o="outcols", + r="registration", + s="skiprows", + w="wrap", +) +@kwargs_to_strings(R="sequence") +def select(table=None, outfile=None, **kwargs): + r""" + Select data table subsets based on multiple spatial criteria. + + This is a filter that reads (x, y) or (longitude, latitude) positions from + the first 2 columns of *table* and uses a combination of 1-7 criteria to + pass or reject the records. Records can be selected based on whether or not + they are: + + 1. inside a rectangular region (**region** [and **projection**]) + 2. within *dist* km of any point in *pointfile* + 3. within *dist* km of any line in *linefile* + 4. inside one of the polygons in the *polygonfile* + 5. inside geographical features (based on coastlines) + 6. has z-values within a given range, or + 7. inside bins of a grid mask whose nodes are non-zero + + The sense of the tests can be reversed for each of these 7 criteria by + using the **reverse** option. + + Full option list at :gmt-docs:`gmtselect.html` + + {aliases} + + Parameters + ---------- + table : str or {table-like} + Pass in either a file name to an ASCII data table, a 2D + {table-classes}. + outfile : str + The file name for the output ASCII file. + reverse : str + [**cflrsz**]. + Reverses the sense of the test for each of the criteria specified: + + - **c** select records NOT inside any point's circle of influence. + - **f** select records NOT inside any of the polygons. + - **g** will pass records inside the cells with z equal zero of the + grid mask in **-G**. + - **l** select records NOT within the specified distance of any line. 
+ - **r** select records NOT inside the specified rectangular region. + - **s** select records NOT considered inside as specified by **-N** + (and **-A**, **-D**). + - **z** select records NOT within the range specified by + **z_subregion**. + z_subregion : str + *min*\ [/*max*]\ [**+a**]\ [**+c**\ *col*]\ [**+i**]. + Pass all records whose 3rd column (*z*; *col* = 2) lies within the + given range or is NaN (use **skiprows** to skip NaN records). If *max* + is omitted then we test if *z* equals *min* instead. This means + equality within 5 ULPs (unit of least precision; + http://en.wikipedia.org/wiki/Unit_in_the_last_place). Input file must + have at least three columns. To indicate no limit on min or max, + specify a hyphen (-). If your 3rd column is absolute time then remember + to supply ``coltypes="2T"``. To specify another column, append + **+c**\ *col*, and to specify several tests just repeat the + **z_subregion** option as many times as you have columns to test. + **Note**: When more than one **z_subregion** option is given then the + ``reverse="z"`` option cannot be used. In the case of multiple tests + you may use these modifiers as well: **+a** passes any record that + passes at least one of your *z* tests [Default is all tests must pass], + and **+i** reverses the tests to pass record with *z* value NOT in the + given range. Finally, if **+c** is not used then it is automatically + incremented for each new **z_subregion** option, starting with 2. + {J} + {R} + {V} + {b} + {d} + {e} + {f} + {g} + {h} + {i} + {o} + {r} + {s} + {w} + + Returns + ------- + output : pandas.DataFrame or None + Return type depends on whether the ``outfile`` parameter is set: + + - :class:`pandas.DataFrame` table if ``outfile`` is not set. + - None if ``outfile`` is set (filtered output will be stored in file + set by ``outfile``). 
+ """ + + with GMTTempFile(suffix=".csv") as tmpfile: + with Session() as lib: + # Choose how data will be passed into the module + table_context = lib.virtualfile_from_data(check_kind="vector", data=table) + with table_context as infile: + if outfile is None: + outfile = tmpfile.name + arg_str = " ".join([infile, build_arg_string(kwargs), "->" + outfile]) + lib.call_module(module="gmtselect", args=arg_str) + + # Read temporary csv output to a pandas table + if outfile == tmpfile.name: # if user did not set outfile, return pd.DataFrame + try: + column_names = table.columns.to_list() + result = pd.read_csv(tmpfile.name, sep="\t", names=column_names) + except AttributeError: # 'str' object has no attribute 'columns' + result = pd.read_csv(tmpfile.name, sep="\t", header=None, comment=">") + elif outfile != tmpfile.name: # return None if outfile set, output in outfile + result = None + + return result diff --git a/pygmt/tests/test_select.py b/pygmt/tests/test_select.py new file mode 100644 index 00000000000..16169c158d7 --- /dev/null +++ b/pygmt/tests/test_select.py @@ -0,0 +1,64 @@ +""" +Tests for select. +""" +import os + +import numpy.testing as npt +import pandas as pd +import pytest +from pygmt import select +from pygmt.datasets import load_sample_bathymetry +from pygmt.helpers import GMTTempFile + + +@pytest.fixture(scope="module", name="dataframe") +def fixture_dataframe(): + """ + Load the table data from the sample bathymetry dataset. + """ + return load_sample_bathymetry() + + +def test_select_input_dataframe(dataframe): + """ + Run select by passing in a pandas.DataFrame as input. 
+ """ + output = select(table=dataframe, region=[250, 251, 26, 27]) + assert isinstance(output, pd.DataFrame) + assert all(dataframe.columns == output.columns) + assert output.shape == (65, 3) + npt.assert_allclose(output.median(), [250.31464, 26.33893, -270.0]) + + +def test_select_input_table_matrix(dataframe): + """ + Run select using table input that is not a pandas.DataFrame but still a + matrix. + + Also testing the reverse (I) alias. + """ + table = dataframe.values + output = select(table=table, region=[245.5, 254.5, 20.5, 29.5], reverse="r") + assert isinstance(output, pd.DataFrame) + assert output.shape == (9177, 3) + npt.assert_allclose(output.median(), [247.235, 20.48624, -3241.0]) + + +def test_select_input_filename(): + """ + Run select by passing in an ASCII text file as input. + + Also testing the z_subregion (Z) alias. + """ + with GMTTempFile() as tmpfile: + output = select( + table="@tut_ship.xyz", + region=[250, 251, 26, 27], + z_subregion=["-/-630", "-120/0+a"], + outfile=tmpfile.name, + ) + assert output is None # check that output is None since outfile is set + assert os.path.exists(path=tmpfile.name) + output = pd.read_csv(tmpfile.name, sep="\t", header=None) + assert output.shape == (5, 3) + npt.assert_allclose(output.median(), [250.12149, 26.04296, -674.0]) From 35ea6b82f9b1696d5666f195511947a6db9b850d Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Thu, 12 Aug 2021 22:13:26 +1200 Subject: [PATCH 2/7] Alias area_thresh (A) for select --- pygmt/src/select.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pygmt/src/select.py b/pygmt/src/select.py index e4ccf8a1a67..4238c5255e2 100644 --- a/pygmt/src/select.py +++ b/pygmt/src/select.py @@ -14,6 +14,7 @@ @fmt_docstring @use_alias( + A="area_thresh", I="reverse", J="projection", R="region", @@ -63,6 +64,7 @@ def select(table=None, outfile=None, **kwargs): {table-classes}. outfile : str The file name for the output ASCII file. 
+ {A} reverse : str [**cflrsz**]. Reverses the sense of the test for each of the criteria specified: @@ -77,6 +79,9 @@ def select(table=None, outfile=None, **kwargs): (and **-A**, **-D**). - **z** select records NOT within the range specified by **z_subregion**. + {J} + {R} + {V} z_subregion : str *min*\ [/*max*]\ [**+a**]\ [**+c**\ *col*]\ [**+i**]. Pass all records whose 3rd column (*z*; *col* = 2) lies within the @@ -96,9 +101,6 @@ def select(table=None, outfile=None, **kwargs): and **+i** reverses the tests to pass record with *z* value NOT in the given range. Finally, if **+c** is not used then it is automatically incremented for each new **z_subregion** option, starting with 2. - {J} - {R} - {V} {b} {d} {e} From f8051ad1dc8a0333bbe2935627b4dcf6b6949ae6 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Mon, 27 Sep 2021 21:27:50 +1300 Subject: [PATCH 3/7] Rename 'table' parameter to 'data' --- pygmt/src/select.py | 10 +++++----- pygmt/tests/test_select.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pygmt/src/select.py b/pygmt/src/select.py index 4238c5255e2..e166df08bc7 100644 --- a/pygmt/src/select.py +++ b/pygmt/src/select.py @@ -33,12 +33,12 @@ w="wrap", ) @kwargs_to_strings(R="sequence") -def select(table=None, outfile=None, **kwargs): +def select(data=None, outfile=None, **kwargs): r""" Select data table subsets based on multiple spatial criteria. This is a filter that reads (x, y) or (longitude, latitude) positions from - the first 2 columns of *table* and uses a combination of 1-7 criteria to + the first 2 columns of *data* and uses a combination of 1-7 criteria to pass or reject the records. Records can be selected based on whether or not they are: @@ -59,7 +59,7 @@ def select(table=None, outfile=None, **kwargs): Parameters ---------- - table : str or {table-like} + data : str or {table-like} Pass in either a file name to an ASCII data table, a 2D {table-classes}. 
outfile : str @@ -126,7 +126,7 @@ def select(table=None, outfile=None, **kwargs): with GMTTempFile(suffix=".csv") as tmpfile: with Session() as lib: # Choose how data will be passed into the module - table_context = lib.virtualfile_from_data(check_kind="vector", data=table) + table_context = lib.virtualfile_from_data(check_kind="vector", data=data) with table_context as infile: if outfile is None: outfile = tmpfile.name @@ -136,7 +136,7 @@ def select(table=None, outfile=None, **kwargs): # Read temporary csv output to a pandas table if outfile == tmpfile.name: # if user did not set outfile, return pd.DataFrame try: - column_names = table.columns.to_list() + column_names = data.columns.to_list() result = pd.read_csv(tmpfile.name, sep="\t", names=column_names) except AttributeError: # 'str' object has no attribute 'columns' result = pd.read_csv(tmpfile.name, sep="\t", header=None, comment=">") diff --git a/pygmt/tests/test_select.py b/pygmt/tests/test_select.py index 16169c158d7..fe18912f608 100644 --- a/pygmt/tests/test_select.py +++ b/pygmt/tests/test_select.py @@ -23,7 +23,7 @@ def test_select_input_dataframe(dataframe): """ Run select by passing in a pandas.DataFrame as input. """ - output = select(table=dataframe, region=[250, 251, 26, 27]) + output = select(data=dataframe, region=[250, 251, 26, 27]) assert isinstance(output, pd.DataFrame) assert all(dataframe.columns == output.columns) assert output.shape == (65, 3) @@ -37,8 +37,8 @@ def test_select_input_table_matrix(dataframe): Also testing the reverse (I) alias. 
""" - table = dataframe.values - output = select(table=table, region=[245.5, 254.5, 20.5, 29.5], reverse="r") + data = dataframe.values + output = select(data=data, region=[245.5, 254.5, 20.5, 29.5], reverse="r") assert isinstance(output, pd.DataFrame) assert output.shape == (9177, 3) npt.assert_allclose(output.median(), [247.235, 20.48624, -3241.0]) @@ -52,7 +52,7 @@ def test_select_input_filename(): """ with GMTTempFile() as tmpfile: output = select( - table="@tut_ship.xyz", + data="@tut_ship.xyz", region=[250, 251, 26, 27], z_subregion=["-/-630", "-120/0+a"], outfile=tmpfile.name, From b6a858494b2eb2e06732294b7c2ef5f507c0d841 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sun, 3 Oct 2021 22:24:15 +1300 Subject: [PATCH 4/7] Parse incols and outcols arguments using sequence_comma --- pygmt/src/select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygmt/src/select.py b/pygmt/src/select.py index e166df08bc7..864047ad8a6 100644 --- a/pygmt/src/select.py +++ b/pygmt/src/select.py @@ -32,7 +32,7 @@ s="skiprows", w="wrap", ) -@kwargs_to_strings(R="sequence") +@kwargs_to_strings(R="sequence", i="sequence_comma", o="sequence_comma") def select(data=None, outfile=None, **kwargs): r""" Select data table subsets based on multiple spatial criteria. From d2902eee90ef61283fe11d1dd549fe0133af8fef Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sun, 3 Oct 2021 23:52:28 +1300 Subject: [PATCH 5/7] Use area_thresh instead of -A in docstring for reverse param --- pygmt/src/select.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygmt/src/select.py b/pygmt/src/select.py index 864047ad8a6..2e13169bf66 100644 --- a/pygmt/src/select.py +++ b/pygmt/src/select.py @@ -76,7 +76,7 @@ def select(data=None, outfile=None, **kwargs): - **l** select records NOT within the specified distance of any line. - **r** select records NOT inside the specified rectangular region. 
- **s** select records NOT considered inside as specified by **-N** - (and **-A**, **-D**). + (and **area_thresh**, **-D**). - **z** select records NOT within the range specified by **z_subregion**. {J} From 96a41bf3bea27a30042d7284acf6200ab4c10376 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Wed, 27 Oct 2021 23:18:14 +1300 Subject: [PATCH 6/7] Use long aliases instead of short param in description of reverse param Specifically, resolution (D), gridmask (G) and mask (N). These aliases are currently undocumented/disabled, but will be implemented/enabled in the future. --- pygmt/src/select.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pygmt/src/select.py b/pygmt/src/select.py index 2e13169bf66..9f9076f8c86 100644 --- a/pygmt/src/select.py +++ b/pygmt/src/select.py @@ -15,8 +15,11 @@ @fmt_docstring @use_alias( A="area_thresh", + # D="resolution", + # G="gridmask", I="reverse", J="projection", + # N="mask", R="region", V="verbose", Z="z_subregion", @@ -72,11 +75,11 @@ def select(data=None, outfile=None, **kwargs): - **c** select records NOT inside any point's circle of influence. - **f** select records NOT inside any of the polygons. - **g** will pass records inside the cells with z equal zero of the - grid mask in **-G**. + grid mask in **gridmask**. - **l** select records NOT within the specified distance of any line. - **r** select records NOT inside the specified rectangular region. - - **s** select records NOT considered inside as specified by **-N** - (and **area_thresh**, **-D**). + - **s** select records NOT considered inside as specified by **mask** + (and **area_thresh**, **resolution**). - **z** select records NOT within the range specified by **z_subregion**. 
{J} From 4d003de4cd39ba7993182c80b542068bb8c1b65f Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Thu, 28 Oct 2021 11:13:33 +1300 Subject: [PATCH 7/7] Alias resolution (D), gridmask (G) and mask (N) for gmtselect --- pygmt/src/select.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/pygmt/src/select.py b/pygmt/src/select.py index 9f9076f8c86..2c8256cad6f 100644 --- a/pygmt/src/select.py +++ b/pygmt/src/select.py @@ -15,11 +15,11 @@ @fmt_docstring @use_alias( A="area_thresh", - # D="resolution", - # G="gridmask", + D="resolution", + G="gridmask", I="reverse", J="projection", - # N="mask", + N="mask", R="region", V="verbose", Z="z_subregion", @@ -35,7 +35,7 @@ s="skiprows", w="wrap", ) -@kwargs_to_strings(R="sequence", i="sequence_comma", o="sequence_comma") +@kwargs_to_strings(N="sequence", R="sequence", i="sequence_comma", o="sequence_comma") def select(data=None, outfile=None, **kwargs): r""" Select data table subsets based on multiple spatial criteria. @@ -68,6 +68,19 @@ def select(data=None, outfile=None, **kwargs): outfile : str The file name for the output ASCII file. {A} + resolution : str + *resolution*\ [**+f**]. + Ignored unless **mask** is set. Selects the resolution of the coastline + data set to use ((**f**)ull, (**h**)igh, (**i**)ntermediate, (**l**)ow, + or (**c**)rude). The resolution drops off by ~80% between data sets. + [Default is **l**]. Append (**+f**) to automatically select a lower + resolution should the one requested not be available [Default is abort + if not found]. Note that because the coastlines differ in details it is + not guaranteed that a point will remain inside [or outside] when a + different resolution is selected. + gridmask : str + Pass all locations that are inside the valid data area of the grid + *gridmask*. Nodes that are outside are either NaN or zero. reverse : str [**cflrsz**]. 
Reverses the sense of the test for each of the criteria specified: @@ -83,6 +96,16 @@ def select(data=None, outfile=None, **kwargs): - **z** select records NOT within the range specified by **z_subregion**. {J} + mask : str or list + Pass all records whose location is inside specified geographical + features. Specify if records should be skipped (s) or kept (k) using + 1 of 2 formats: + + - *wet/dry*. + - *ocean/land/lake/island/pond*. + + [Default is s/k/s/k/s (i.e., s/k), which passes all points on dry + land]. {R} {V} z_subregion : str