Skip to content

Commit

Permalink
Adds a module to convert pandas dataframes to netCDF files.
Browse files Browse the repository at this point in the history
Intended for conversion to DSG netCDF format; if the user does not set
enough parameters, a non-CF-compliant netCDF will be written and a
warning issued. This should work for time series, profiles, or
trajectories, with main testing done against METAR dataframes.
  • Loading branch information
zbruick committed Jun 27, 2019
1 parent d3c2940 commit 9a6af29
Show file tree
Hide file tree
Showing 5 changed files with 9,560 additions and 4 deletions.
13 changes: 9 additions & 4 deletions metpy/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
# Copyright (c) 2015,2016,2018 MetPy Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
"""Classes for reading and writing various file formats.

These classes are written to take both file names (for local files) or file-like objects;
this allows reading files that are already in memory (using :class:`python:io.StringIO`)
or remote files (using :func:`~python:urllib.request.urlopen`).

The DataframeToNetCDF class takes a pandas dataframe and writes a netCDF file (in DSG
format if applicable).
"""

from .gini import *  # noqa: F403
from .nexrad import *  # noqa: F403
from .pandas_to_netcdf import *  # noqa: F403  (was misspelled 'noga', so flake8 flagged it)

# Build the package's public API from each submodule's declared exports.
__all__ = gini.__all__[:]  # pylint: disable=undefined-variable
__all__.extend(nexrad.__all__)  # pylint: disable=undefined-variable
__all__.extend(pandas_to_netcdf.__all__)  # pylint: disable=undefined-variable
166 changes: 166 additions & 0 deletions metpy/io/pandas_to_netcdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Copyright (c) 2019 MetPy Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
"""Support writing a pandas DataFrame to a DSG netCDF."""

# NOTE: import the submodule explicitly -- `import importlib` alone does not
# guarantee that `importlib.util` (used below via find_spec) is importable.
import importlib.util
import logging
from os import path

from numpy import arange
import xarray as xr

from ..package_tools import Exporter

exporter = Exporter(globals())

log = logging.getLogger(__name__)


@exporter.export
class DataframeToNetCDF(object):
    r"""Convert a Pandas DataFrame into a DSG netCDF.

    This class is designed to take a pandas DataFrame and convert it into a
    discrete sampling geometry netCDF file. The imagined use cases are for surface
    observations and atmospheric profiles, but the class should handle any point data,
    including time series, profiles, and trajectories.
    """

    def __init__(self, df, sampling_var, sampling_data_vars, path_to_save, netcdf_format=None,
                 column_units=None, standard_names=None, long_names=None, dataset_type=None):
        r"""Take a Pandas DataFrame and convert it to a netCDF file.

        If given a Pandas DataFrame, this function will first convert
        it to a xarray Dataset, attach attributes and metadata to it as
        provided by the user, and then save it as a CF-compliant discrete
        sampling geometry (DSG) netCDF file. Assumes each row of the DataFrame
        is a unique observation.

        This function is ideal for point data, such as station observations,
        or for trajectory or profile data, which is discretely sampled at
        individual points.

        Parameters
        ----------
        df : `pandas.DataFrame`
            Point data in pandas dataframe.
        sampling_var : str
            Column name that is the sampling dimension: for surface observations,
            this is the column that contains the station identifier/name
        sampling_data_vars : list
            List of all variables associated with the sampling variable that do not
            vary with time, such as latitude, longitude, and elevation for
            surface observations
        path_to_save : str
            Path, including filename, for where to save netCDF file.
        netcdf_format : str, optional
            File format passed through to `xarray.Dataset.to_netcdf`
            (e.g. 'NETCDF4', 'NETCDF3_CLASSIC'). Defaults to xarray's choice.
        column_units : dict, optional
            Dictionary of units to attach to columns of the dataframe. Overrides
            the units attribute if it is attached to the dataframe. Must contain an
            entry for every variable, or a KeyError will be raised.
        standard_names : dict, optional
            Dictionary of variable descriptions that are CF-compliant
        long_names : dict, optional
            Dictionary of longer variable descriptions that provide more detail
            than standard_names
        dataset_type: str, optional
            Type of dataset to be converted. Options are 'timeSeries', 'profile',
            or 'trajectory'. While optional, this variable should be declared to create
            a CF-compliant DSG netCDF file.

        Returns
        -------
        NetCDF file saved to `path_to_save`.

        Raises
        ------
        TypeError
            If `df` is not usable as a pandas DataFrame.
        ValueError
            If a file already exists at `path_to_save`.

        """
        # Return a TypeError if not provided a Pandas DataFrame
        try:
            # Work on a copy so the caller's DataFrame is not mutated; otherwise a
            # second call with the same DataFrame fails on the duplicate inserts below.
            df = df.copy()

            # Create the dimensions for use later in netCDF file
            samplingindex = df.groupby([sampling_var], sort=False).ngroup()
            obs = arange(0, len(df))
            df.insert(0, 'samplingIndex', samplingindex)
            df.insert(1, 'observations', obs)

            # Handle the sampling location specific data. Copy the slice so the
            # insert below does not write through to (a view of) the original frame.
            sampling_data = df[sampling_data_vars].copy()
            samples = sampling_data.groupby([sampling_var], sort=False).ngroup()
            sampling_data.insert(0, 'samples', samples)
            sampling_data = sampling_data.groupby('samples').first()
            dataset_samples = xr.Dataset.from_dataframe(sampling_data)

            # Create the dataset for the variables of each observation.
            # verify_integrity must be true in order for conversion to netCDF to work.
            df = df.drop(sampling_data_vars, axis=1)
            df = df.set_index(['observations'], verify_integrity=True)
            dataset_var = xr.Dataset.from_dataframe(df)

            # Merge the two datasets together
            dataset_final = xr.merge([dataset_samples, dataset_var], compat='no_conflicts')

        except (AttributeError, ValueError, TypeError) as err:
            # The operations above only fail like this when df is not a usable
            # DataFrame; chain the original error so the real cause is visible.
            raise TypeError('A pandas dataframe was not provided') from err

        # Attach variable-specific metadata. The helper dimensions created above get
        # placeholder entries so the per-variable loops below do not KeyError on them.
        if column_units:
            column_units['samples'] = ''
            column_units['observations'] = ''
            column_units['samplingIndex'] = ''
            for var in dataset_final.variables:
                dataset_final[var].attrs["units"] = column_units[var]
        if standard_names:
            standard_names['samples'] = ''
            standard_names['observations'] = ''
            standard_names['samplingIndex'] = ''
            for var in dataset_final.variables:
                dataset_final[var].attrs["standard_name"] = standard_names[var]
        else:
            log.warning('No standard names provided - netCDF file will not be CF compliant.')
        if long_names:
            long_names['samples'] = 'Sampling dimension'
            long_names['observations'] = 'Observation dimension'
            long_names['samplingIndex'] = 'Index of station for this observation'
            for var in dataset_final.variables:
                dataset_final[var].attrs["long_name"] = long_names[var]

        # Attach dataset-specific metadata
        if dataset_type:
            dataset_final.attrs['featureType'] = dataset_type
        else:
            # Note the trailing space inside the first literal - without it the
            # implicitly concatenated message reads "metadatafor".
            log.warning('No dataset type provided - netCDF will not have appropriate '
                        'metadata for a DSG dataset.')
        if dataset_type:
            dataset_final[sampling_var].attrs['cf_role'] = dataset_type.lower() + '_id'
            dataset_final['samplingIndex'].attrs['instance_dimension'] = 'samples'

        # Determine mode to write to netCDF
        write_mode = 'w'
        if path.exists(path_to_save):
            # Eventually switch to 'a' to allow appending and delete error
            raise ValueError('File already exists - please delete and run again')
        # Check if netCDF4 is installed to see how many unlimited dimensions we can use
        check_netcdf4 = importlib.util.find_spec('netCDF4')
        # Make sure path is a string to allow netCDF4 to be used - needed for tests to pass
        path_to_save = str(path_to_save)

        if check_netcdf4 is not None:
            unlimited_dimensions = ['samples', 'observations']
        else:
            # Due to xarray's fallback to scipy if netCDF4-python is not installed
            # only one dimension can be unlimited. This may cause issues for users
            log.warning('NetCDF4 not installed - saving as a netCDF3 file with only the '
                        'observations dimension as unlimited. If netCDF4 or multiple '
                        'dimensions are desired, run `pip install netCDF4`')
            unlimited_dimensions = ['observations']

        # Convert to netCDF
        dataset_final.to_netcdf(path=path_to_save, mode=write_mode, format=netcdf_format,
                                unlimited_dims=unlimited_dimensions, compute=True)
66 changes: 66 additions & 0 deletions metpy/io/tests/test_pandas_to_netcdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright (c) 2019 MetPy Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
"""Test the `pandas_to_netcdf` module."""

import logging
import os

import numpy as np
import pandas as pd
import xarray as xr

from metpy.cbook import get_test_data
from metpy.io import DataframeToNetCDF

# Turn off the warnings for tests: DataframeToNetCDF deliberately warns when
# optional metadata (standard names, dataset type) is omitted, and several
# tests below omit it on purpose.
logging.getLogger('metpy.io.pandas_to_netcdf').setLevel(logging.CRITICAL)


def test_dataframe_to_netcdf_basic(tmpdir):
    """Round-trip a METAR dataframe through netCDF and spot-check one value."""
    metars = pd.read_csv(get_test_data('parsed_metars_example3.csv'))
    station_vars = ['station_id', 'latitude', 'longitude', 'elevation']
    DataframeToNetCDF(metars, sampling_var='station_id',
                      sampling_data_vars=station_vars,
                      path_to_save=tmpdir + '/test.nc')
    assert os.path.exists(tmpdir + '/test.nc')
    ds = xr.open_dataset(str(tmpdir) + '/test.nc')
    assert np.max(ds['temperature']) == 47


def test_dataframe_to_netcdf_units(tmpdir):
    """Check that units supplied via a dictionary end up as variable attributes."""
    metars = pd.read_csv(get_test_data('parsed_metars_example3.csv'))
    # Columns with no physical units get empty-string entries.
    unitless = ['samples', 'observations', 'samplingIndex', 'station_id', 'date_time',
                'day', 'time_utc', 'wx1', 'wx2', 'skyc1', 'skyc2', 'skyc3', 'skyc4',
                'cloudcover', 'wx_symbol1', 'wx_symbol2']
    col_units = {name: '' for name in unitless}
    col_units.update({'latitude': 'degrees', 'longitude': 'degrees',
                      'elevation': 'meters', 'wind_direction': 'degrees',
                      'wind_speed': 'kts', 'skylev1': 'feet', 'skylev2': 'feet',
                      'skylev3': 'feet', 'skylev4': 'feet', 'temperature': 'degC',
                      'dewpoint': 'degC', 'altimeter': 'inHg',
                      'sea_level_pressure': 'hPa'})
    DataframeToNetCDF(metars, sampling_var='station_id',
                      sampling_data_vars=['station_id', 'latitude', 'longitude',
                                          'elevation'],
                      path_to_save=tmpdir + '/test.nc',
                      column_units=col_units, dataset_type='timeSeries')
    ds = xr.open_dataset(str(tmpdir) + '/test.nc')
    assert ds['station_id'].attrs['cf_role'] == 'timeseries_id'
    assert ds['temperature'].attrs['units'] == 'degC'


def test_dataframe_to_netcdf_names(tmpdir):
    """Check that standard and long names supplied as dictionaries are attached."""
    obs = pd.DataFrame({'temperature': [1, 2, 2, 3],
                        'pressure': [1, 2, 2, 3],
                        'latitude': [4, 5, 6, 7],
                        'longitude': [1, 2, 3, 4],
                        'station_id': ['KFNL', 'KDEN', 'KVPZ', 'KORD']})
    descriptions = {'temperature': '2-meter air temperature',
                    'pressure': 'Mean sea-level air pressure',
                    'latitude': 'Station latitude',
                    'longitude': 'Station longitude',
                    'station_id': 'Station identifier'}
    cf_names = {'temperature': 'air_temperature',
                'pressure': 'air_pressure_at_mean_sea_level',
                'latitude': 'latitude',
                'longitude': 'longitude',
                'station_id': 'platform_id'}
    DataframeToNetCDF(obs, sampling_var='station_id',
                      sampling_data_vars=['station_id', 'latitude', 'longitude'],
                      path_to_save=tmpdir + '/test.nc',
                      standard_names=cf_names, long_names=descriptions)
    ds = xr.open_dataset(str(tmpdir) + '/test.nc')
    assert ds['temperature'].attrs['standard_name'] == 'air_temperature'
    assert ds['station_id'].attrs['long_name'] == 'Station identifier'
1 change: 1 addition & 0 deletions metpy/static-data-manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ nids/Level3_SLC_TV0_20160516_2359.nids 3083ed0c3b19ef22e4fe0becff961065239ed919e
nids/sn.last e086870fe5b4c441f613e62004ffe9b96c56819b1eefbcb05dc98b63f4aa4163
nn_bbox0to100.npz 36fbc982f7a42519172c2310afa5f7c312309f34f10d6a5e99030ad35ce7af13
nov11_sounding.txt 6fa3e0920314a7d55d6e1020eb934e18d9623c5fb1a40aaad546a25ed225e215
parsed_metars_example3.csv e98f617acd02de3c613f8f65a148357fee6604fc17be0d8e2bd4abe54ef98d5f
rbf_test.npz f035f4415ea9bf04dcaf8affd7748f6519638655dcce90dad2b54fe0032bf32d
station_data.txt 3c1b71abb95ef8fe4adf57e47e2ce67f3529c6fe025b546dd40c862999fc5ffe
timeseries.csv 2d79f8f21ad1fcec12d0e24750d0958631e92c9148adfbd1b7dc8defe8c56fc5
Expand Down
Loading

0 comments on commit 9a6af29

Please sign in to comment.