Allow using Arrow datatypes when converting to Pandas DataFrame (#315)
adamreeve authored Nov 9, 2023
1 parent 58b9808 commit 74a3123
Showing 4 changed files with 171 additions and 19 deletions.
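
For context, this is roughly how the new option is used once the commit is applied (a minimal sketch; the file path is hypothetical, and PyArrow must be installed alongside pandas):

    from nptdms import TdmsFile

    tdms_file = TdmsFile.read("my_file.tdms")
    # Columns (and the optional time index) are backed by PyArrow data
    # types such as int32[pyarrow] instead of NumPy dtypes.
    df = tdms_file.as_dataframe(arrow_dtypes=True)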
55 changes: 43 additions & 12 deletions nptdms/export/pandas_export.py
@@ -2,7 +2,8 @@
 import numpy as np


-def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True):
+def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True,
+                   arrow_dtypes=False):
     """
     Converts the TDMS file to a DataFrame. DataFrame columns are named using the TDMS object paths.
@@ -12,6 +13,7 @@ def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True):
         values are absolute times or relative to the start time.
     :param scaled_data: By default the scaled data will be used.
         Set to False to use raw unscaled data.
+    :param arrow_dtypes: Use PyArrow data types in the DataFrame.
     :return: The full TDMS file data.
     :rtype: pandas.DataFrame
     """
@@ -20,10 +22,13 @@ def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True):
     for group in tdms_file.groups():
         for channel in group.channels():
             channels_to_export[channel.path] = channel
-    return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
+    return _channels_to_dataframe(
+        channels_to_export,
+        time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+        arrow_dtypes=arrow_dtypes)


-def from_group(group, time_index=False, absolute_time=False, scaled_data=True):
+def from_group(group, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
     """
     Converts a TDMS group object to a DataFrame. DataFrame columns are named using the channel names.
@@ -33,15 +38,19 @@ def from_group(group, time_index=False, absolute_time=False, scaled_data=True):
         values are absolute times or relative to the start time.
     :param scaled_data: By default the scaled data will be used.
         Set to False to use raw unscaled data.
+    :param arrow_dtypes: Use PyArrow data types in the DataFrame.
     :return: The TDMS object data.
     :rtype: pandas.DataFrame
     """

     channels_to_export = OrderedDict((ch.name, ch) for ch in group.channels())
-    return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
+    return _channels_to_dataframe(
+        channels_to_export, time_index=time_index, absolute_time=absolute_time,
+        scaled_data=scaled_data, arrow_dtypes=arrow_dtypes)


-def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True):
+def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True,
+                 arrow_dtypes=False):
     """
     Converts the TDMS channel to a DataFrame
@@ -51,39 +60,61 @@ def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True):
         values are absolute times or relative to the start time.
     :param scaled_data: By default the scaled data will be used.
         Set to False to use raw unscaled data.
+    :param arrow_dtypes: Use PyArrow data types in the DataFrame.
     :return: The TDMS object data.
     :rtype: pandas.DataFrame
     """

     channels_to_export = {channel.path: channel}
-    return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
+    return _channels_to_dataframe(
+        channels_to_export, time_index=time_index, absolute_time=absolute_time,
+        scaled_data=scaled_data, arrow_dtypes=arrow_dtypes)


-def _channels_to_dataframe(channels_to_export, time_index=False, absolute_time=False, scaled_data=True):
+def _channels_to_dataframe(
+        channels_to_export, time_index=False, absolute_time=False, scaled_data=True,
+        arrow_dtypes=False):
     import pandas as pd

-    dataframe_dict = OrderedDict()
+    column_data = []
     for column_name, channel in channels_to_export.items():
         index = channel.time_track(absolute_time) if time_index else None
         if scaled_data:
-            dataframe_dict[column_name] = pd.Series(data=_array_for_pd(channel[:]), index=index)
+            column_data.append((column_name, _array_for_pd(channel[:]), index))
         elif channel.scaler_data_types:
             # Channel has DAQmx raw data
             raw_data = channel.read_data(scaled=False)
             for scale_id, scaler_data in raw_data.items():
                 scaler_column_name = column_name + "[{0:d}]".format(scale_id)
-                dataframe_dict[scaler_column_name] = pd.Series(data=scaler_data, index=index)
+                column_data.append((scaler_column_name, scaler_data, index))
         else:
             # Raw data for normal TDMS file
             raw_data = channel.read_data(scaled=False)
-            dataframe_dict[column_name] = pd.Series(data=_array_for_pd(raw_data), index=index)
+            column_data.append((column_name, _array_for_pd(raw_data), index))

+    dataframe_dict = OrderedDict()
+    if arrow_dtypes:
+        import pyarrow as pa
+
+        for column_name, data, index in column_data:
+            # Let arrow deduce data types from the numpy dtypes
+            if index is not None:
+                index_array = pa.array(index)
+                index = pd.Index(index_array, dtype=pd.ArrowDtype(index_array.type))
+            data_array = pa.array(data)
+            dataframe_dict[column_name] = pd.Series(
+                data=data_array, dtype=pd.ArrowDtype(data_array.type), index=index)
+    else:
+        for column_name, data, index in column_data:
+            dataframe_dict[column_name] = pd.Series(data=data, index=index)
+
     return pd.DataFrame.from_dict(dataframe_dict)


 def _array_for_pd(array):
     """ Convert data array to a format suitable for a Pandas dataframe
     """
-    if np.issubdtype(array.dtype, np.dtype('void')):
+    if np.issubdtype(array.dtype, np.dtype('void')) and len(array.dtype) == 0:
         # If dtype is void then the array must also be empty.
         # Pandas doesn't like void data types, so these are converted to empty float64 arrays
         # and Pandas will fill values with NaN
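
The core of this change is the arrow_dtypes branch above: pa.array infers an Arrow type from each NumPy array, and pd.ArrowDtype wraps that type so the resulting Series uses an Arrow-backed extension dtype. A standalone sketch of that mechanism, assuming a recent pandas (1.5+) with pyarrow installed and made-up data:

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    data = np.array([1, 2], dtype=np.int32)  # stand-in for channel data
    data_array = pa.array(data)              # Arrow deduces int32 from the numpy dtype
    series = pd.Series(data_array, dtype=pd.ArrowDtype(data_array.type))
    print(series.dtype)                      # int32[pyarrow]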
21 changes: 15 additions & 6 deletions nptdms/tdms.py
@@ -160,7 +160,7 @@ def properties(self):

         return self._properties

-    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
+    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
         """
         Converts the TDMS file to a DataFrame. DataFrame columns are named using the TDMS object paths.
@@ -170,11 +170,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
         :param scaled_data: By default the scaled data will be used.
             Set to False to use raw unscaled data.
             For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
+        :param arrow_dtypes: Use PyArrow data types in the DataFrame.
         :return: The full TDMS file data.
         :rtype: pandas.DataFrame
         """

-        return pandas_export.from_tdms_file(self, time_index, absolute_time, scaled_data)
+        return pandas_export.from_tdms_file(
+            self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+            arrow_dtypes=arrow_dtypes)

     def as_hdf(self, filepath, mode='w', group='/'):
         """
@@ -388,7 +391,7 @@ def channels(self):
         """
         return list(self._channels.values())

-    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
+    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
         """
         Converts the TDMS group to a DataFrame. DataFrame columns are named using the channel names.
@@ -398,11 +401,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
         :param scaled_data: By default the scaled data will be used.
             Set to False to use raw unscaled data.
             For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
+        :param arrow_dtypes: Use PyArrow data types in the DataFrame.
         :return: The TDMS object data.
         :rtype: pandas.DataFrame
         """

-        return pandas_export.from_group(self, time_index, absolute_time, scaled_data)
+        return pandas_export.from_group(
+            self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+            arrow_dtypes=arrow_dtypes)

     def __len__(self):
         """ Returns the number of channels in this group
@@ -692,7 +698,7 @@ def time_track(self, absolute_time=False, accuracy='ns'):
         return (start_time +
                 (relative_time * unit_correction).astype(time_type))

-    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
+    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
         """
         Converts the TDMS channel to a DataFrame. The DataFrame column is named using the channel path.
@@ -702,11 +708,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
         :param scaled_data: By default the scaled data will be used.
             Set to False to use raw unscaled data.
             For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
+        :param arrow_dtypes: Use PyArrow data types in the DataFrame.
         :return: The TDMS object data.
         :rtype: pandas.DataFrame
         """

-        return pandas_export.from_channel(self, time_index, absolute_time, scaled_data)
+        return pandas_export.from_channel(
+            self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+            arrow_dtypes=arrow_dtypes)

     def _read_data_values(self):
         for chunk in self.data_chunks():
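
The same keyword is threaded through all three as_dataframe entry points above. A brief sketch of the group- and channel-level calls (file path, group and channel names are hypothetical):

    from nptdms import TdmsFile

    tdms_file = TdmsFile.read("my_file.tdms")
    group_df = tdms_file["Group"].as_dataframe(arrow_dtypes=True)
    channel_df = tdms_file["Group"]["Channel1"].as_dataframe(
        arrow_dtypes=True, time_index=True, absolute_time=True)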
113 changes: 112 additions & 1 deletion nptdms/test/test_pandas.py
@@ -189,7 +189,7 @@ def test_file_as_dataframe_with_absolute_time():
     df = tdms_data.as_dataframe(time_index=True, absolute_time=True)

     expected_start = datetime(2015, 9, 8, 10, 5, 49)
-    assert (df.index == expected_start)[0]
+    assert (df.index[0] == expected_start)


 @pytest.mark.parametrize('lazy_load', [True, False])
@@ -321,6 +321,117 @@ def test_raw_daqmx_channel_export(lazy_load):
     np.testing.assert_equal(dataframe["/'Group'/'Channel1'[1]"], expected_data[1])


+@pytest.mark.parametrize('abs_time_index', [False, True])
+def test_dataframe_with_arrow_types(abs_time_index):
+    test_file = GeneratedFile()
+    test_file.add_segment(*timed_segment())
+
+    tdms_data = test_file.load()
+
+    file_df = tdms_data.as_dataframe(
+        arrow_dtypes=True, time_index=abs_time_index, absolute_time=abs_time_index)
+
+    group_df = tdms_data['Group'].as_dataframe(
+        arrow_dtypes=True, time_index=abs_time_index, absolute_time=abs_time_index)
+
+    channel_df = tdms_data['Group']['Channel1'].as_dataframe(
+        arrow_dtypes=True, time_index=abs_time_index, absolute_time=abs_time_index)
+
+    assert len(file_df) == 2
+    assert "/'Group'/'Channel1'" in file_df.keys()
+    assert "/'Group'/'Channel2'" in file_df.keys()
+
+    def check_series(series):
+        assert (series == [1, 2]).all()
+        assert series.dtype == "int32[pyarrow]"
+        if abs_time_index:
+            assert series.index.dtype == "timestamp[ns][pyarrow]"
+
+    check_series(file_df["/'Group'/'Channel1'"])
+    check_series(group_df['Channel1'])
+    check_series(channel_df["/'Group'/'Channel1'"])
+
+
+@pytest.mark.parametrize('arrow_dtypes', [False, True])
+def test_bool_data_to_pandas(arrow_dtypes):
+    test_file, expected_data = scenarios.bool_data().values
+    df = test_file.load()['group'].as_dataframe(arrow_dtypes=arrow_dtypes)
+    np.testing.assert_allclose(df['bool_channel'], expected_data[('group', 'bool_channel')])
+
+
+@pytest.mark.parametrize('arrow_dtypes', [False, True])
+def test_string_data_to_pandas(arrow_dtypes):
+    strings = ["abcdefg", "qwertyuiop"]
+
+    test_file = GeneratedFile()
+    toc = ("kTocMetaData", "kTocRawData", "kTocNewObjList")
+    metadata = (
+        # Number of objects
+        "01 00 00 00"
+        # Length of the object path
+        "18 00 00 00")
+    metadata += string_hexlify("/'Group'/'StringChannel'")
+    metadata += (
+        # Length of index information
+        "1C 00 00 00"
+        # Raw data data type
+        "20 00 00 00"
+        # Dimension
+        "01 00 00 00"
+        # Number of raw data values
+        "02 00 00 00"
+        "00 00 00 00"
+        # Number of bytes in data
+        "19 00 00 00"
+        "00 00 00 00"
+        # Number of properties (0)
+        "00 00 00 00")
+    data = (
+        "07 00 00 00"  # index to after first string
+        "11 00 00 00"  # index to after second string
+    )
+    for string in strings:
+        data += string_hexlify(string)
+    test_file.add_segment(toc, metadata, data)
+    tdms_data = test_file.load()
+
+    series = tdms_data["Group"].as_dataframe(arrow_dtypes=arrow_dtypes)["StringChannel"]
+
+    assert len(series) == len(strings)
+    for expected, read in zip(strings, series):
+        assert expected == read
+
+
+def test_dataframe_with_complex_data():
+    test_file, expected_data = scenarios.complex_data().values
+    df = test_file.load()['group'].as_dataframe()
+    np.testing.assert_allclose(df['complex_single_channel'], expected_data[('group', 'complex_single_channel')])
+    np.testing.assert_allclose(df['complex_double_channel'], expected_data[('group', 'complex_double_channel')])
+
+
+def test_dataframe_with_raw_timestamp_data():
+    test_file = GeneratedFile()
+    seconds = 3672033330
+    second_fractions = 1234567890 * 10 ** 10
+    test_file.add_segment(
+        ("kTocMetaData", "kTocRawData", "kTocNewObjList"),
+        segment_objects_metadata(
+            channel_metadata("/'group'/'channel1'", 0x44, 4),
+        ),
+        hexlify_value("<Q", 0) + hexlify_value("<q", seconds) +
+        hexlify_value("<Q", second_fractions) + hexlify_value("<q", seconds) +
+        hexlify_value("<Q", 0) + hexlify_value("<q", seconds + 1) +
+        hexlify_value("<Q", second_fractions) + hexlify_value("<q", seconds + 1)
+    )
+
+    with test_file.get_tempfile() as temp_file:
+        tdms_data = TdmsFile.read(temp_file.file, raw_timestamps=True)
+        with pytest.raises(ValueError) as exc_info:
+            tdms_data['group'].as_dataframe()
+        message = str(exc_info.value)
+        assert "compound dtype" in message
+
+
 def test_export_with_empty_channels():
     """Convert a group to dataframe when a channel has empty data and void data type"""

1 change: 1 addition & 0 deletions setup.cfg
@@ -42,6 +42,7 @@ test =
     scipy
 pandas =
     pandas
+    pyarrow
 hdf =
     h5py >= 2.10.0
 thermocouple_scaling =
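
With pyarrow added to the pandas extra, installing npTDMS with pandas support should also pull in PyArrow, e.g. (standard pip extras syntax):

    pip install npTDMS[pandas]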
