Skip to content

Commit

Permalink
Adds a module to convert pandas dataframes to netCDF files.
Browse files Browse the repository at this point in the history
Intended for conversion to DSG netCDF format; if the user does not set
enough parameters, a non-CF-compliant netCDF will be written and a
warning issued. This should work for time series, profiles, or
trajectories, with main testing done against METAR dataframes.
  • Loading branch information
zbruick committed Jun 27, 2019
1 parent d3c2940 commit 9a6af29
Show file tree
Hide file tree
Showing 5 changed files with 9,560 additions and 4 deletions.
13 changes: 9 additions & 4 deletions metpy/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
# Copyright (c) 2015,2016,2018 MetPy Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
"""Classes for reading and writing various file formats.

These classes are written to take both file names (for local files) or file-like objects;
this allows reading files that are already in memory (using :class:`python:io.StringIO`)
or remote files (using :func:`~python:urllib.request.urlopen`).

The DataframeToNetCDF class takes a pandas dataframe and writes a netCDF file (in DSG
format if applicable).
"""

from .gini import *  # noqa: F403
from .nexrad import *  # noqa: F403
from .pandas_to_netcdf import *  # noqa: F403  (was misspelled 'noga', so flake8 flagged it)

# Build the package's public API from each submodule's declared exports.
__all__ = gini.__all__[:]  # pylint: disable=undefined-variable
__all__.extend(nexrad.__all__)  # pylint: disable=undefined-variable
__all__.extend(pandas_to_netcdf.__all__)  # pylint: disable=undefined-variable
166 changes: 166 additions & 0 deletions metpy/io/pandas_to_netcdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Copyright (c) 2019 MetPy Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
"""Support writing a pandas DataFrame to a DSG netCDF."""

# NOTE: import the submodule explicitly -- `import importlib` alone does not
# guarantee that `importlib.util` (used below via find_spec) is importable.
import importlib.util
import logging
from os import path

from numpy import arange
import xarray as xr

from ..package_tools import Exporter

exporter = Exporter(globals())

log = logging.getLogger(__name__)


@exporter.export
class DataframeToNetCDF(object):
    r"""Convert a Pandas DataFrame into a DSG netCDF.

    This class is designed to take a pandas DataFrame and convert it into a
    discrete sampling geometry netCDF file. The imagined use cases are for surface
    observations and atmospheric profiles, but the class should handle any point data,
    including time series, profiles, and trajectories.
    """

    def __init__(self, df, sampling_var, sampling_data_vars, path_to_save, netcdf_format=None,
                 column_units=None, standard_names=None, long_names=None, dataset_type=None):
        r"""Take a Pandas DataFrame and convert it to a netCDF file.

        If given a Pandas DataFrame, this function will first convert
        it to a xarray Dataset, attach attributes and metadata to it as
        provided by the user, and then save it as a CF-compliant discrete
        sampling geometry (DSG) netCDF file. Assumes each row of the DataFrame
        is a unique observation.

        This function is ideal for point data, such as station observations,
        or for trajectory or profile data, which is discretely sampled at
        individual points.

        Parameters
        ----------
        df : `pandas.DataFrame`
            Point data in pandas dataframe.
        sampling_var : str
            Column name that is the sampling dimension: for surface observations,
            this is the column that contains the station identifier/name
        sampling_data_vars : list
            List of all variables associated with the sampling variable that do not
            vary with time, such as latitude, longitude, and elevation for
            surface observations
        path_to_save : str
            Path, including filename, for where to save netCDF file.
        netcdf_format : str, optional
            File format passed through to `xarray.Dataset.to_netcdf`
            (e.g. 'NETCDF4', 'NETCDF3_CLASSIC'). Defaults to xarray's choice.
        column_units : dict, optional
            Dictionary of units to attach to columns of the dataframe. Overrides
            the units attribute if it is attached to the dataframe. Must contain an
            entry for every variable, or a KeyError will be raised.
        standard_names : dict, optional
            Dictionary of variable descriptions that are CF-compliant
        long_names : dict, optional
            Dictionary of longer variable descriptions that provide more detail
            than standard_names
        dataset_type: str, optional
            Type of dataset to be converted. Options are 'timeSeries', 'profile',
            or 'trajectory'. While optional, this variable should be declared to create
            a CF-compliant DSG netCDF file.

        Returns
        -------
        NetCDF file saved to `path_to_save`.

        Raises
        ------
        TypeError
            If `df` is not usable as a pandas DataFrame.
        ValueError
            If a file already exists at `path_to_save`.

        """
        # Return a TypeError if not provided a Pandas DataFrame
        try:
            # Work on a copy so the caller's DataFrame is not mutated; otherwise a
            # second call with the same DataFrame fails on the duplicate inserts below.
            df = df.copy()

            # Create the dimensions for use later in netCDF file
            samplingindex = df.groupby([sampling_var], sort=False).ngroup()
            obs = arange(0, len(df))
            df.insert(0, 'samplingIndex', samplingindex)
            df.insert(1, 'observations', obs)

            # Handle the sampling location specific data. Copy the slice so the
            # insert below does not write through to (a view of) the original frame.
            sampling_data = df[sampling_data_vars].copy()
            samples = sampling_data.groupby([sampling_var], sort=False).ngroup()
            sampling_data.insert(0, 'samples', samples)
            sampling_data = sampling_data.groupby('samples').first()
            dataset_samples = xr.Dataset.from_dataframe(sampling_data)

            # Create the dataset for the variables of each observation.
            # verify_integrity must be true in order for conversion to netCDF to work.
            df = df.drop(sampling_data_vars, axis=1)
            df = df.set_index(['observations'], verify_integrity=True)
            dataset_var = xr.Dataset.from_dataframe(df)

            # Merge the two datasets together
            dataset_final = xr.merge([dataset_samples, dataset_var], compat='no_conflicts')

        except (AttributeError, ValueError, TypeError) as err:
            # The operations above only fail like this when df is not a usable
            # DataFrame; chain the original error so the real cause is visible.
            raise TypeError('A pandas dataframe was not provided') from err

        # Attach variable-specific metadata. The helper dimensions created above get
        # placeholder entries so the per-variable loops below do not KeyError on them.
        if column_units:
            column_units['samples'] = ''
            column_units['observations'] = ''
            column_units['samplingIndex'] = ''
            for var in dataset_final.variables:
                dataset_final[var].attrs["units"] = column_units[var]
        if standard_names:
            standard_names['samples'] = ''
            standard_names['observations'] = ''
            standard_names['samplingIndex'] = ''
            for var in dataset_final.variables:
                dataset_final[var].attrs["standard_name"] = standard_names[var]
        else:
            log.warning('No standard names provided - netCDF file will not be CF compliant.')
        if long_names:
            long_names['samples'] = 'Sampling dimension'
            long_names['observations'] = 'Observation dimension'
            long_names['samplingIndex'] = 'Index of station for this observation'
            for var in dataset_final.variables:
                dataset_final[var].attrs["long_name"] = long_names[var]

        # Attach dataset-specific metadata
        if dataset_type:
            dataset_final.attrs['featureType'] = dataset_type
        else:
            # Note the trailing space inside the first literal - without it the
            # implicitly concatenated message reads "metadatafor".
            log.warning('No dataset type provided - netCDF will not have appropriate '
                        'metadata for a DSG dataset.')
        if dataset_type:
            dataset_final[sampling_var].attrs['cf_role'] = dataset_type.lower() + '_id'
            dataset_final['samplingIndex'].attrs['instance_dimension'] = 'samples'

        # Determine mode to write to netCDF
        write_mode = 'w'
        if path.exists(path_to_save):
            # Eventually switch to 'a' to allow appending and delete error
            raise ValueError('File already exists - please delete and run again')
        # Check if netCDF4 is installed to see how many unlimited dimensions we can use
        check_netcdf4 = importlib.util.find_spec('netCDF4')
        # Make sure path is a string to allow netCDF4 to be used - needed for tests to pass
        path_to_save = str(path_to_save)

        if check_netcdf4 is not None:
            unlimited_dimensions = ['samples', 'observations']
        else:
            # Due to xarray's fallback to scipy if netCDF4-python is not installed
            # only one dimension can be unlimited. This may cause issues for users
            log.warning('NetCDF4 not installed - saving as a netCDF3 file with only the '
                        'observations dimension as unlimited. If netCDF4 or multiple '
                        'dimensions are desired, run `pip install netCDF4`')
            unlimited_dimensions = ['observations']

        # Convert to netCDF
        dataset_final.to_netcdf(path=path_to_save, mode=write_mode, format=netcdf_format,
                                unlimited_dims=unlimited_dimensions, compute=True)
66 changes: 66 additions & 0 deletions metpy/io/tests/test_pandas_to_netcdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright (c) 2019 MetPy Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
"""Test the `pandas_to_netcdf` module."""

import logging
import os

import numpy as np
import pandas as pd
import xarray as xr

from metpy.cbook import get_test_data
from metpy.io import DataframeToNetCDF

# Turn off the warnings for tests: DataframeToNetCDF deliberately warns when
# optional metadata (standard names, dataset type) is omitted, and several
# tests below omit it on purpose.
logging.getLogger('metpy.io.pandas_to_netcdf').setLevel(logging.CRITICAL)


def test_dataframe_to_netcdf_basic(tmpdir):
    """Round-trip a METAR dataframe through netCDF and spot-check one value."""
    metars = pd.read_csv(get_test_data('parsed_metars_example3.csv'))
    station_vars = ['station_id', 'latitude', 'longitude', 'elevation']
    DataframeToNetCDF(metars, sampling_var='station_id',
                      sampling_data_vars=station_vars,
                      path_to_save=tmpdir + '/test.nc')
    assert os.path.exists(tmpdir + '/test.nc')
    ds = xr.open_dataset(str(tmpdir) + '/test.nc')
    assert np.max(ds['temperature']) == 47


def test_dataframe_to_netcdf_units(tmpdir):
    """Check that units supplied via a dictionary end up as variable attributes."""
    metars = pd.read_csv(get_test_data('parsed_metars_example3.csv'))
    # Columns with no physical units get empty-string entries.
    unitless = ['samples', 'observations', 'samplingIndex', 'station_id', 'date_time',
                'day', 'time_utc', 'wx1', 'wx2', 'skyc1', 'skyc2', 'skyc3', 'skyc4',
                'cloudcover', 'wx_symbol1', 'wx_symbol2']
    col_units = {name: '' for name in unitless}
    col_units.update({'latitude': 'degrees', 'longitude': 'degrees',
                      'elevation': 'meters', 'wind_direction': 'degrees',
                      'wind_speed': 'kts', 'skylev1': 'feet', 'skylev2': 'feet',
                      'skylev3': 'feet', 'skylev4': 'feet', 'temperature': 'degC',
                      'dewpoint': 'degC', 'altimeter': 'inHg',
                      'sea_level_pressure': 'hPa'})
    DataframeToNetCDF(metars, sampling_var='station_id',
                      sampling_data_vars=['station_id', 'latitude', 'longitude',
                                          'elevation'],
                      path_to_save=tmpdir + '/test.nc',
                      column_units=col_units, dataset_type='timeSeries')
    ds = xr.open_dataset(str(tmpdir) + '/test.nc')
    assert ds['station_id'].attrs['cf_role'] == 'timeseries_id'
    assert ds['temperature'].attrs['units'] == 'degC'


def test_dataframe_to_netcdf_names(tmpdir):
    """Check that standard and long names supplied as dictionaries are attached."""
    obs = pd.DataFrame({'temperature': [1, 2, 2, 3],
                        'pressure': [1, 2, 2, 3],
                        'latitude': [4, 5, 6, 7],
                        'longitude': [1, 2, 3, 4],
                        'station_id': ['KFNL', 'KDEN', 'KVPZ', 'KORD']})
    descriptions = {'temperature': '2-meter air temperature',
                    'pressure': 'Mean sea-level air pressure',
                    'latitude': 'Station latitude',
                    'longitude': 'Station longitude',
                    'station_id': 'Station identifier'}
    cf_names = {'temperature': 'air_temperature',
                'pressure': 'air_pressure_at_mean_sea_level',
                'latitude': 'latitude',
                'longitude': 'longitude',
                'station_id': 'platform_id'}
    DataframeToNetCDF(obs, sampling_var='station_id',
                      sampling_data_vars=['station_id', 'latitude', 'longitude'],
                      path_to_save=tmpdir + '/test.nc',
                      standard_names=cf_names, long_names=descriptions)
    ds = xr.open_dataset(str(tmpdir) + '/test.nc')
    assert ds['temperature'].attrs['standard_name'] == 'air_temperature'
    assert ds['station_id'].attrs['long_name'] == 'Station identifier'
1 change: 1 addition & 0 deletions metpy/static-data-manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ nids/Level3_SLC_TV0_20160516_2359.nids 3083ed0c3b19ef22e4fe0becff961065239ed919e
nids/sn.last e086870fe5b4c441f613e62004ffe9b96c56819b1eefbcb05dc98b63f4aa4163
nn_bbox0to100.npz 36fbc982f7a42519172c2310afa5f7c312309f34f10d6a5e99030ad35ce7af13
nov11_sounding.txt 6fa3e0920314a7d55d6e1020eb934e18d9623c5fb1a40aaad546a25ed225e215
parsed_metars_example3.csv e98f617acd02de3c613f8f65a148357fee6604fc17be0d8e2bd4abe54ef98d5f
rbf_test.npz f035f4415ea9bf04dcaf8affd7748f6519638655dcce90dad2b54fe0032bf32d
station_data.txt 3c1b71abb95ef8fe4adf57e47e2ce67f3529c6fe025b546dd40c862999fc5ffe
timeseries.csv 2d79f8f21ad1fcec12d0e24750d0958631e92c9148adfbd1b7dc8defe8c56fc5
Expand Down
Loading

0 comments on commit 9a6af29

Please sign in to comment.