-
Notifications
You must be signed in to change notification settings - Fork 415
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adds a module to convert pandas dataframes to netCDF files.
Intended for conversion to DSG netCDF format; if the user does not set enough parameters, a non-CF-compliant netCDF will be written and a warning issued. This should work for time series, profiles, or trajectories, with main testing done against METAR dataframes.
- Loading branch information
Showing
5 changed files
with
9,560 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,20 @@ | ||
# Copyright (c) 2015,2016,2018 MetPy Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
"""Classes for reading and writing various file formats.

The gini and nexrad classes are written to take both file names (for local files)
or file-like objects; this allows reading files that are already in memory (using
:class:`python:io.StringIO`) or remote files (using :func:`~python:urllib.request.urlopen`).

The DataframeToNetCDF class takes a pandas dataframe and writes a netCDF file (in DSG
format if applicable).
"""

# Fixed typo: the third line previously read ``# noga: F403``, which flake8 does not
# recognize, so the star-import warning was not actually being suppressed.
from .gini import *  # noqa: F403
from .nexrad import *  # noqa: F403
from .pandas_to_netcdf import *  # noqa: F403

# Build the package's public API from each submodule's declared exports
__all__ = gini.__all__[:]  # pylint: disable=undefined-variable
__all__.extend(nexrad.__all__)  # pylint: disable=undefined-variable
__all__.extend(pandas_to_netcdf.__all__)  # pylint: disable=undefined-variable
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
# Copyright (c) 2019 MetPy Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
"""Support writing a pandas DataFrame to a (DSG) netCDF file."""

# ``importlib.util`` must be imported explicitly; ``import importlib`` alone is not
# guaranteed to make the ``util`` submodule available for the find_spec() call below.
import importlib.util
import logging
from os import path

from numpy import arange
import xarray as xr

from ..package_tools import Exporter

exporter = Exporter(globals())

log = logging.getLogger(__name__)
|
||
|
||
@exporter.export
class DataframeToNetCDF(object):
    r"""Convert a Pandas DataFrame into a DSG netCDF.

    This class is designed to take a pandas DataFrame and convert it into a
    discrete sampling geometry netCDF file. The imagined use cases are for surface
    observations and atmospheric profiles, but the class should handle any point data,
    including time series, profiles, and trajectories.
    """

    def __init__(self, df, sampling_var, sampling_data_vars, path_to_save, netcdf_format=None,
                 column_units=None, standard_names=None, long_names=None, dataset_type=None):
        r"""Take a Pandas DataFrame and convert it to a netCDF file.

        If given a Pandas DataFrame, this function will first convert
        it to a xarray Dataset, attach attributes and metadata to it as
        provided by the user, and then save it as a CF-compliant discrete
        sampling geometry (DSG) netCDF file. Assumes each row of the DataFrame
        is a unique observation.

        This function is ideal for point data, such as station observations,
        or for trajectory or profile data, which is discretely sampled at
        individual points.

        Parameters
        ----------
        df : `pandas.DataFrame`
            Point data in pandas dataframe.
        sampling_var : str
            Column name that is the sampling dimension: for surface observations,
            this is the column that contains the station identifier/name
        sampling_data_vars : list
            List of all variables associated with the sampling variable that do not
            vary with time, such as latitude, longitude, and elevation for
            surface observations
        path_to_save : str
            Path, including filename, for where to save netCDF file.
        netcdf_format : str, optional
            NetCDF file format passed through to `xarray.Dataset.to_netcdf`
            (e.g. 'NETCDF4', 'NETCDF3_CLASSIC'). If None, xarray chooses a
            default based on the available backends.
        column_units : dict, optional
            Dictionary of units to attach to columns of the dataframe. Overrides
            the units attribute if it is attached to the dataframe.
        standard_names : dict, optional
            Dictionary of variable descriptions that are CF-compliant
        long_names : dict, optional
            Dictionary of longer variable descriptions that provide more detail
            than standard_names
        dataset_type : str, optional
            Type of dataset to be converted. Options are 'timeSeries', 'profile',
            or 'trajectory'. While optional, this variable should be declared to create
            a CF-compliant DSG netCDF file.

        Returns
        -------
        NetCDF file saved to `path_to_save`.

        """
        # Verify_integrity must be true in order for conversion to netCDF to work
        # Return a TypeError if not provided a Pandas DataFrame
        try:
            # Create the dimensions for use later in netCDF file: one index grouping
            # rows by sampling location, and one running observation counter
            samplingindex = df.groupby([sampling_var], sort=False).ngroup()
            obs = arange(0, len(df))
            df.insert(0, 'samplingIndex', samplingindex)
            df.insert(1, 'observations', obs)

            # Handle the sampling location specific data: one row per unique sample
            sampling_data = df[sampling_data_vars]
            samples = sampling_data.groupby([sampling_var], sort=False).ngroup()
            sampling_data.insert(0, 'samples', samples)
            sampling_data = sampling_data.groupby('samples').first()
            dataset_samples = xr.Dataset.from_dataframe(sampling_data)

            # Create the dataset for the variables of each observation
            df = df.drop(sampling_data_vars, axis=1)
            df = df.set_index(['observations'], verify_integrity=True)
            dataset_var = xr.Dataset.from_dataframe(df)

            # Merge the two datasets together
            dataset_final = xr.merge([dataset_samples, dataset_var], compat='no_conflicts')

        except (AttributeError, ValueError, TypeError) as err:
            # Chain the original exception so the underlying cause is not lost
            raise TypeError('A pandas dataframe was not provided') from err

        # Attach variable-specific metadata; the bookkeeping dimensions always get
        # placeholder entries so every variable in the dataset has an attribute.
        # NOTE(review): a user dict missing any data column will raise KeyError here.
        if column_units:
            column_units['samples'] = ''
            column_units['observations'] = ''
            column_units['samplingIndex'] = ''
            for var in dataset_final.variables:
                dataset_final[var].attrs["units"] = column_units[var]
        if standard_names:
            standard_names['samples'] = ''
            standard_names['observations'] = ''
            standard_names['samplingIndex'] = ''
            for var in dataset_final.variables:
                dataset_final[var].attrs["standard_name"] = standard_names[var]
        else:
            log.warning('No standard names provided - netCDF file will not be CF compliant.')
        if long_names:
            long_names['samples'] = 'Sampling dimension'
            long_names['observations'] = 'Observation dimension'
            long_names['samplingIndex'] = 'Index of station for this observation'
            for var in dataset_final.variables:
                dataset_final[var].attrs["long_name"] = long_names[var]

        # Attach dataset-specific metadata
        if dataset_type:
            dataset_final.attrs['featureType'] = dataset_type
        else:
            # Fixed: the two string pieces previously concatenated without a space,
            # producing "...metadatafor a DSG dataset."
            log.warning('No dataset type provided - netCDF will not have appropriate metadata '
                        'for a DSG dataset.')
        if dataset_type:
            dataset_final[sampling_var].attrs['cf_role'] = dataset_type.lower() + '_id'
            dataset_final['samplingIndex'].attrs['instance_dimension'] = 'samples'

        # Determine mode to write to netCDF
        write_mode = 'w'
        if path.exists(path_to_save):
            # Eventually switch to 'a' to allow appending and delete error
            raise ValueError('File already exists - please delete and run again')
        # Check if netCDF4 is installed to see how many unlimited dimensions we can use
        check_netcdf4 = importlib.util.find_spec('netCDF4')
        # Make sure path is a string to allow netCDF4 to be used - needed for tests to pass
        path_to_save = str(path_to_save)

        if check_netcdf4 is not None:
            unlimited_dimensions = ['samples', 'observations']
        else:
            # Due to xarray's fallback to scipy if netCDF4-python is not installed
            # only one dimension can be unlimited. This may cause issues for users.
            # Fixed: missing spaces between concatenated string pieces previously
            # produced "theobservations" and "multipledimensions" in the message.
            log.warning('NetCDF4 not installed - saving as a netCDF3 file with only the '
                        'observations dimension as unlimited. If netCDF4 or multiple '
                        'dimensions are desired, run `pip install netCDF4`')
            unlimited_dimensions = ['observations']

        # Convert to netCDF
        dataset_final.to_netcdf(path=path_to_save, mode=write_mode, format=netcdf_format,
                                unlimited_dims=unlimited_dimensions, compute=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# Copyright (c) 2019 MetPy Developers. | ||
# Distributed under the terms of the BSD 3-Clause License. | ||
# SPDX-License-Identifier: BSD-3-Clause | ||
"""Test the `pandas_to_netcdf` module.""" | ||
|
||
import logging | ||
import os | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import xarray as xr | ||
|
||
from metpy.cbook import get_test_data | ||
from metpy.io import DataframeToNetCDF | ||
|
||
# Silence this module's warnings (non-CF-compliance, missing netCDF4) during tests
logging.getLogger('metpy.io.pandas_to_netcdf').setLevel(logging.CRITICAL)
|
||
|
||
def test_dataframe_to_netcdf_basic(tmpdir):
    """Test dataframe conversion to netcdf."""
    # Parse an example METAR file and write it out with the minimal required arguments
    metar_df = pd.read_csv(get_test_data('parsed_metars_example3.csv'))
    output_path = tmpdir + '/test.nc'
    DataframeToNetCDF(metar_df, path_to_save=output_path, sampling_var='station_id',
                      sampling_data_vars=['station_id', 'latitude', 'longitude', 'elevation'])
    assert os.path.exists(output_path)
    # Read the file back and spot-check a known value
    result = xr.open_dataset(str(tmpdir) + '/test.nc')
    assert np.max(result['temperature']) == 47
|
||
|
||
def test_dataframe_to_netcdf_units(tmpdir):
    """Test units attached via a dictionary."""
    metar_df = pd.read_csv(get_test_data('parsed_metars_example3.csv'))
    # Units for every column; dimensionless columns get an empty string
    units = {'samples': '', 'observations': '', 'samplingIndex': '', 'station_id': '',
             'latitude': 'degrees', 'longitude': 'degrees', 'elevation': 'meters',
             'date_time': '', 'day': '', 'time_utc': '', 'wind_direction': 'degrees',
             'wind_speed': 'kts', 'wx1': '', 'wx2': '', 'skyc1': '', 'skylev1': 'feet',
             'skyc2': '', 'skylev2': 'feet', 'skyc3': '', 'skylev3': 'feet', 'skyc4': '',
             'skylev4': 'feet', 'cloudcover': '', 'temperature': 'degC',
             'dewpoint': 'degC', 'altimeter': 'inHg', 'sea_level_pressure': 'hPa',
             'wx_symbol1': '', 'wx_symbol2': ''}
    DataframeToNetCDF(metar_df, path_to_save=tmpdir + '/test.nc', sampling_var='station_id',
                      sampling_data_vars=['station_id', 'latitude', 'longitude', 'elevation'],
                      column_units=units, dataset_type='timeSeries')
    # Verify both the DSG metadata and a per-variable unit survived the round trip
    result = xr.open_dataset(str(tmpdir) + '/test.nc')
    assert result['station_id'].attrs['cf_role'] == 'timeseries_id'
    assert result['temperature'].attrs['units'] == 'degC'
|
||
|
||
def test_dataframe_to_netcdf_names(tmpdir):
    """Test attachment of standard names via a dictionary."""
    # Build a tiny synthetic station dataset
    obs = pd.DataFrame({
        'temperature': pd.Series([1, 2, 2, 3]), 'pressure': pd.Series([1, 2, 2, 3]),
        'latitude': pd.Series([4, 5, 6, 7]), 'longitude': pd.Series([1, 2, 3, 4]),
        'station_id': pd.Series(['KFNL', 'KDEN', 'KVPZ', 'KORD'])})
    descriptions = {'temperature': '2-meter air temperature',
                    'pressure': 'Mean sea-level air pressure', 'latitude': 'Station latitude',
                    'longitude': 'Station longitude', 'station_id': 'Station identifier'}
    cf_names = {'temperature': 'air_temperature',
                'pressure': 'air_pressure_at_mean_sea_level', 'latitude': 'latitude',
                'longitude': 'longitude', 'station_id': 'platform_id'}
    DataframeToNetCDF(obs, path_to_save=tmpdir + '/test.nc', sampling_var='station_id',
                      sampling_data_vars=['station_id', 'latitude', 'longitude'],
                      standard_names=cf_names, long_names=descriptions)
    # Both standard_name and long_name attributes should appear in the saved file
    result = xr.open_dataset(str(tmpdir) + '/test.nc')
    assert result['temperature'].attrs['standard_name'] == 'air_temperature'
    assert result['station_id'].attrs['long_name'] == 'Station identifier'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.