Allow using Arrow datatypes when converting to Pandas DataFrame (#315)
adamreeve authored Nov 9, 2023
1 parent 58b9808 commit 74a3123
Showing 4 changed files with 171 additions and 19 deletions.
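
For context, this is roughly how the new option is used once the commit is applied (a minimal sketch; the file path is hypothetical, and PyArrow must be installed alongside pandas):

    from nptdms import TdmsFile

    tdms_file = TdmsFile.read("my_file.tdms")
    # Columns (and the optional time index) are backed by PyArrow data
    # types such as int32[pyarrow] instead of NumPy dtypes.
    df = tdms_file.as_dataframe(arrow_dtypes=True)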
55 changes: 43 additions & 12 deletions nptdms/export/pandas_export.py
@@ -2,7 +2,8 @@
 import numpy as np


-def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True):
+def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True,
+                   arrow_dtypes=False):
     """
     Converts the TDMS file to a DataFrame. DataFrame columns are named using the TDMS object paths.
@@ -12,6 +13,7 @@ def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True):
         values are absolute times or relative to the start time.
     :param scaled_data: By default the scaled data will be used.
         Set to False to use raw unscaled data.
+    :param arrow_dtypes: Use PyArrow data types in the DataFrame.
     :return: The full TDMS file data.
     :rtype: pandas.DataFrame
     """
@@ -20,10 +22,13 @@ def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True):
     for group in tdms_file.groups():
         for channel in group.channels():
             channels_to_export[channel.path] = channel
-    return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
+    return _channels_to_dataframe(
+        channels_to_export,
+        time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+        arrow_dtypes=arrow_dtypes)


-def from_group(group, time_index=False, absolute_time=False, scaled_data=True):
+def from_group(group, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
     """
     Converts a TDMS group object to a DataFrame. DataFrame columns are named using the channel names.
@@ -33,15 +38,19 @@ def from_group(group, time_index=False, absolute_time=False, scaled_data=True):
         values are absolute times or relative to the start time.
     :param scaled_data: By default the scaled data will be used.
         Set to False to use raw unscaled data.
+    :param arrow_dtypes: Use PyArrow data types in the DataFrame.
     :return: The TDMS object data.
     :rtype: pandas.DataFrame
     """

     channels_to_export = OrderedDict((ch.name, ch) for ch in group.channels())
-    return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
+    return _channels_to_dataframe(
+        channels_to_export, time_index=time_index, absolute_time=absolute_time,
+        scaled_data=scaled_data, arrow_dtypes=arrow_dtypes)


-def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True):
+def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True,
+                 arrow_dtypes=False):
     """
     Converts the TDMS channel to a DataFrame
@@ -51,39 +60,61 @@ def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True):
         values are absolute times or relative to the start time.
     :param scaled_data: By default the scaled data will be used.
         Set to False to use raw unscaled data.
+    :param arrow_dtypes: Use PyArrow data types in the DataFrame.
     :return: The TDMS object data.
     :rtype: pandas.DataFrame
     """

     channels_to_export = {channel.path: channel}
-    return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
+    return _channels_to_dataframe(
+        channels_to_export, time_index=time_index, absolute_time=absolute_time,
+        scaled_data=scaled_data, arrow_dtypes=arrow_dtypes)


-def _channels_to_dataframe(channels_to_export, time_index=False, absolute_time=False, scaled_data=True):
+def _channels_to_dataframe(
+        channels_to_export, time_index=False, absolute_time=False, scaled_data=True,
+        arrow_dtypes=False):
     import pandas as pd

-    dataframe_dict = OrderedDict()
+    column_data = []
     for column_name, channel in channels_to_export.items():
         index = channel.time_track(absolute_time) if time_index else None
         if scaled_data:
-            dataframe_dict[column_name] = pd.Series(data=_array_for_pd(channel[:]), index=index)
+            column_data.append((column_name, _array_for_pd(channel[:]), index))
         elif channel.scaler_data_types:
             # Channel has DAQmx raw data
             raw_data = channel.read_data(scaled=False)
             for scale_id, scaler_data in raw_data.items():
                 scaler_column_name = column_name + "[{0:d}]".format(scale_id)
-                dataframe_dict[scaler_column_name] = pd.Series(data=scaler_data, index=index)
+                column_data.append((scaler_column_name, scaler_data, index))
         else:
             # Raw data for normal TDMS file
             raw_data = channel.read_data(scaled=False)
-            dataframe_dict[column_name] = pd.Series(data=_array_for_pd(raw_data), index=index)
+            column_data.append((column_name, _array_for_pd(raw_data), index))

+    dataframe_dict = OrderedDict()
+    if arrow_dtypes:
+        import pyarrow as pa
+
+        for column_name, data, index in column_data:
+            # Let arrow deduce data types from the numpy dtypes
+            if index is not None:
+                index_array = pa.array(index)
+                index = pd.Index(index_array, dtype=pd.ArrowDtype(index_array.type))
+            data_array = pa.array(data)
+            dataframe_dict[column_name] = pd.Series(
+                data=data_array, dtype=pd.ArrowDtype(data_array.type), index=index)
+    else:
+        for column_name, data, index in column_data:
+            dataframe_dict[column_name] = pd.Series(data=data, index=index)
+
     return pd.DataFrame.from_dict(dataframe_dict)


 def _array_for_pd(array):
     """ Convert data array to a format suitable for a Pandas dataframe
     """
-    if np.issubdtype(array.dtype, np.dtype('void')):
+    if np.issubdtype(array.dtype, np.dtype('void')) and len(array.dtype) == 0:
         # If dtype is void then the array must also be empty.
         # Pandas doesn't like void data types, so these are converted to empty float64 arrays
         # and Pandas will fill values with NaN
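
The core of this change is the arrow_dtypes branch above: pa.array infers an Arrow type from each NumPy array, and pd.ArrowDtype wraps that type so the resulting Series uses an Arrow-backed extension dtype. A standalone sketch of that mechanism, assuming a recent pandas (1.5+) with pyarrow installed and made-up data:

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    data = np.array([1, 2], dtype=np.int32)  # stand-in for channel data
    data_array = pa.array(data)              # Arrow deduces int32 from the numpy dtype
    series = pd.Series(data_array, dtype=pd.ArrowDtype(data_array.type))
    print(series.dtype)                      # int32[pyarrow]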
21 changes: 15 additions & 6 deletions nptdms/tdms.py
@@ -160,7 +160,7 @@ def properties(self):

         return self._properties

-    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
+    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
         """
         Converts the TDMS file to a DataFrame. DataFrame columns are named using the TDMS object paths.
@@ -170,11 +170,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
         :param scaled_data: By default the scaled data will be used.
             Set to False to use raw unscaled data.
             For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
+        :param arrow_dtypes: Use PyArrow data types in the DataFrame.
         :return: The full TDMS file data.
         :rtype: pandas.DataFrame
         """

-        return pandas_export.from_tdms_file(self, time_index, absolute_time, scaled_data)
+        return pandas_export.from_tdms_file(
+            self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+            arrow_dtypes=arrow_dtypes)

     def as_hdf(self, filepath, mode='w', group='/'):
         """
@@ -388,7 +391,7 @@ def channels(self):
         """
         return list(self._channels.values())

-    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
+    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
         """
         Converts the TDMS group to a DataFrame. DataFrame columns are named using the channel names.
@@ -398,11 +401,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
         :param scaled_data: By default the scaled data will be used.
             Set to False to use raw unscaled data.
             For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
+        :param arrow_dtypes: Use PyArrow data types in the DataFrame.
         :return: The TDMS object data.
         :rtype: pandas.DataFrame
         """

-        return pandas_export.from_group(self, time_index, absolute_time, scaled_data)
+        return pandas_export.from_group(
+            self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+            arrow_dtypes=arrow_dtypes)

     def __len__(self):
         """ Returns the number of channels in this group
@@ -692,7 +698,7 @@ def time_track(self, absolute_time=False, accuracy='ns'):
         return (start_time +
                 (relative_time * unit_correction).astype(time_type))

-    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
+    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
         """
         Converts the TDMS channel to a DataFrame. The DataFrame column is named using the channel path.
@@ -702,11 +708,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
         :param scaled_data: By default the scaled data will be used.
             Set to False to use raw unscaled data.
             For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
+        :param arrow_dtypes: Use PyArrow data types in the DataFrame.
         :return: The TDMS object data.
         :rtype: pandas.DataFrame
         """

-        return pandas_export.from_channel(self, time_index, absolute_time, scaled_data)
+        return pandas_export.from_channel(
+            self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+            arrow_dtypes=arrow_dtypes)

     def _read_data_values(self):
         for chunk in self.data_chunks():
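
The same keyword is threaded through all three as_dataframe entry points above. A brief sketch of the group- and channel-level calls (file path, group and channel names are hypothetical):

    from nptdms import TdmsFile

    tdms_file = TdmsFile.read("my_file.tdms")
    group_df = tdms_file["Group"].as_dataframe(arrow_dtypes=True)
    channel_df = tdms_file["Group"]["Channel1"].as_dataframe(
        arrow_dtypes=True, time_index=True, absolute_time=True)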
113 changes: 112 additions & 1 deletion nptdms/test/test_pandas.py
@@ -189,7 +189,7 @@ def test_file_as_dataframe_with_absolute_time():
     df = tdms_data.as_dataframe(time_index=True, absolute_time=True)

     expected_start = datetime(2015, 9, 8, 10, 5, 49)
-    assert (df.index == expected_start)[0]
+    assert (df.index[0] == expected_start)


 @pytest.mark.parametrize('lazy_load', [True, False])
@@ -321,6 +321,117 @@ def test_raw_daqmx_channel_export(lazy_load):
     np.testing.assert_equal(dataframe["/'Group'/'Channel1'[1]"], expected_data[1])


+@pytest.mark.parametrize('abs_time_index', [False, True])
+def test_dataframe_with_arrow_types(abs_time_index):
+    test_file = GeneratedFile()
+    test_file.add_segment(*timed_segment())
+
+    tdms_data = test_file.load()
+
+    file_df = tdms_data.as_dataframe(
+        arrow_dtypes=True, time_index=abs_time_index, absolute_time=abs_time_index)
+
+    group_df = tdms_data['Group'].as_dataframe(
+        arrow_dtypes=True, time_index=abs_time_index, absolute_time=abs_time_index)
+
+    channel_df = tdms_data['Group']['Channel1'].as_dataframe(
+        arrow_dtypes=True, time_index=abs_time_index, absolute_time=abs_time_index)
+
+    assert len(file_df) == 2
+    assert "/'Group'/'Channel1'" in file_df.keys()
+    assert "/'Group'/'Channel2'" in file_df.keys()
+
+    def check_series(series):
+        assert (series == [1, 2]).all()
+        assert series.dtype == "int32[pyarrow]"
+        if abs_time_index:
+            assert series.index.dtype == "timestamp[ns][pyarrow]"
+
+    check_series(file_df["/'Group'/'Channel1'"])
+    check_series(group_df['Channel1'])
+    check_series(channel_df["/'Group'/'Channel1'"])
+
+
+@pytest.mark.parametrize('arrow_dtypes', [False, True])
+def test_bool_data_to_pandas(arrow_dtypes):
+    test_file, expected_data = scenarios.bool_data().values
+    df = test_file.load()['group'].as_dataframe(arrow_dtypes=arrow_dtypes)
+    np.testing.assert_allclose(df['bool_channel'], expected_data[('group', 'bool_channel')])
+
+
+@pytest.mark.parametrize('arrow_dtypes', [False, True])
+def test_string_data_to_pandas(arrow_dtypes):
+    strings = ["abcdefg", "qwertyuiop"]
+
+    test_file = GeneratedFile()
+    toc = ("kTocMetaData", "kTocRawData", "kTocNewObjList")
+    metadata = (
+        # Number of objects
+        "01 00 00 00"
+        # Length of the object path
+        "18 00 00 00")
+    metadata += string_hexlify("/'Group'/'StringChannel'")
+    metadata += (
+        # Length of index information
+        "1C 00 00 00"
+        # Raw data data type
+        "20 00 00 00"
+        # Dimension
+        "01 00 00 00"
+        # Number of raw data values
+        "02 00 00 00"
+        "00 00 00 00"
+        # Number of bytes in data
+        "19 00 00 00"
+        "00 00 00 00"
+        # Number of properties (0)
+        "00 00 00 00")
+    data = (
+        "07 00 00 00"  # index to after first string
+        "11 00 00 00"  # index to after second string
+    )
+    for string in strings:
+        data += string_hexlify(string)
+    test_file.add_segment(toc, metadata, data)
+    tdms_data = test_file.load()
+
+    series = tdms_data["Group"].as_dataframe(arrow_dtypes=arrow_dtypes)["StringChannel"]
+
+    assert len(series) == len(strings)
+    for expected, read in zip(strings, series):
+        assert expected == read
+
+
+def test_dataframe_with_complex_data():
+    test_file, expected_data = scenarios.complex_data().values
+    df = test_file.load()['group'].as_dataframe()
+    np.testing.assert_allclose(df['complex_single_channel'], expected_data[('group', 'complex_single_channel')])
+    np.testing.assert_allclose(df['complex_double_channel'], expected_data[('group', 'complex_double_channel')])
+
+
+def test_dataframe_with_raw_timestamp_data():
+    test_file = GeneratedFile()
+    seconds = 3672033330
+    second_fractions = 1234567890 * 10 ** 10
+    test_file.add_segment(
+        ("kTocMetaData", "kTocRawData", "kTocNewObjList"),
+        segment_objects_metadata(
+            channel_metadata("/'group'/'channel1'", 0x44, 4),
+        ),
+        hexlify_value("<Q", 0) + hexlify_value("<q", seconds) +
+        hexlify_value("<Q", second_fractions) + hexlify_value("<q", seconds) +
+        hexlify_value("<Q", 0) + hexlify_value("<q", seconds + 1) +
+        hexlify_value("<Q", second_fractions) + hexlify_value("<q", seconds + 1)
+    )
+
+    with test_file.get_tempfile() as temp_file:
+        tdms_data = TdmsFile.read(temp_file.file, raw_timestamps=True)
+        with pytest.raises(ValueError) as exc_info:
+            tdms_data['group'].as_dataframe()
+        message = str(exc_info.value)
+        assert "compound dtype" in message
+
+
 def test_export_with_empty_channels():
     """Convert a group to dataframe when a channel has empty data and void data type"""

1 change: 1 addition & 0 deletions setup.cfg
@@ -42,6 +42,7 @@ test =
     scipy
 pandas =
     pandas
+    pyarrow
 hdf =
     h5py >= 2.10.0
 thermocouple_scaling =
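
With pyarrow added to the pandas extra, installing npTDMS with pandas support should also pull in PyArrow, e.g. (standard pip extras syntax):

    pip install npTDMS[pandas]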
