Get table or column data with a segmented index (#116)

* utils.to_segmented_index(): add files_duration
* Table.get(): add as_segmented
* Column.get(): add as_segmented
* TST: update
* DOC: update
* DOC: fix docstring
* DOC: fix docstring
* DOC: update usage example
* Update audformat/core/table.py
  Co-authored-by: Hagen Wierstorf <hwierstorf@audeering.com>
* DOC: update docstring
* TST: update

Co-authored-by: Hagen Wierstorf <hwierstorf@audeering.com>
audeering · Nov 10, 2021 · e7b9d75 · e7b9d75
1 parent ea1800c
commit e7b9d75
Show file tree

Hide file tree

Showing 8 changed files with 253 additions and 7 deletions.
diff --git a/audformat/core/column.py b/audformat/core/column.py
@@ -131,6 +131,11 @@ def get(
             *,
             map: str = None,
             copy: bool = True,
+            as_segmented: bool = False,
+            allow_nat: bool = True,
+            root: str = None,
+            num_workers: typing.Optional[int] = 1,
+            verbose: bool = False,
     ) -> pd.Series:
         r"""Get labels.
 
@@ -149,11 +154,30 @@ def get(
                 assigned to a scheme that contains a dict mapping
                 speaker IDs to age entries, ``map='age'``
                 will replace the ID values with the age of the speaker
+            as_segmented: if set to ``True``
+                and column has a filewise index,
+                the index of the returned column
+                will be converted to a segmented index.
+                ``start`` will be set to ``0`` and
+                ``end`` to ``NaT`` or to the file duration
+                if ``allow_nat`` is set to ``False``
+            allow_nat: if set to ``False``,
+                ``end=NaT`` is replaced with file duration
+            root: root directory under which the files are stored.
+                Provide if file names are relative and
+                database was not saved or loaded from disk.
+                If ``None`` :attr:`audformat.Database.root` is used.
+                Only relevant if ``allow_nat`` is set to ``False``
+            num_workers: number of parallel jobs.
+                If ``None`` will be set to the number of processors
+                on the machine multiplied by 5
+            verbose: show progress bar
 
         Returns:
             labels
 
         Raises:
+            FileNotFoundError: if file is not found
             RuntimeError: if column is not assigned to a table
             ValueError: if trying to map without a scheme
             ValueError: if trying to map from a scheme that has no labels
@@ -165,7 +189,15 @@ def get(
                 'Column is not assigned to a table.'
             )
 
-        result = self._table.get(index, copy=False)
+        result = self._table.get(
+            index,
+            copy=False,
+            as_segmented=as_segmented,
+            allow_nat=allow_nat,
+            root=root,
+            num_workers=num_workers,
+            verbose=verbose,
+        )
         result = result[self._id]
 
         if map is not None:

diff --git a/audformat/core/table.py b/audformat/core/table.py
@@ -80,6 +80,12 @@ class Table(HeaderBase):
         file
         f1        0
         f2        1
+        >>> table.get(as_segmented=True)
+                        values
+        file start  end
+        f1   0 days NaT      0
+        f2   0 days NaT      1
+        f3   0 days NaT      2
         >>> index_ex = filewise_index('f4')
         >>> table_ex = table.extend_index(
         ...     index_ex,
@@ -465,6 +471,11 @@ def get(
                 str, typing.Union[str, typing.Sequence[str]]
             ] = None,
             copy: bool = True,
+            as_segmented: bool = False,
+            allow_nat: bool = True,
+            root: str = None,
+            num_workers: typing.Optional[int] = 1,
+            verbose: bool = False,
     ) -> pd.DataFrame:
         r"""Get labels.
 
@@ -487,11 +498,30 @@ def get(
                 values to age and gender, respectively.
                 To also keep the original column with speaker IDs, you can do
                 ``map={'speaker': ['speaker', 'age', 'gender']}``
+            as_segmented: if set to ``True``
+                and table has a filewise index,
+                the index of the returned table
+                will be converted to a segmented index.
+                ``start`` will be set to ``0`` and
+                ``end`` to ``NaT`` or to the file duration
+                if ``allow_nat`` is set to ``False``
+            allow_nat: if set to ``False``,
+                ``end=NaT`` is replaced with file duration
+            root: root directory under which the files are stored.
+                Provide if file names are relative and
+                database was not saved or loaded from disk.
+                If ``None`` :attr:`audformat.Database.root` is used.
+                Only relevant if ``allow_nat`` is set to ``False``
+            num_workers: number of parallel jobs.
+                If ``None`` will be set to the number of processors
+                on the machine multiplied by 5
+            verbose: show progress bar
 
         Returns:
             labels
 
         Raises:
+            FileNotFoundError: if file is not found
             RuntimeError: if table is not assigned to a database
             ValueError: if trying to map without a scheme
             ValueError: if trying to map from a scheme that has no labels
@@ -545,6 +575,26 @@ def get(
                 if column not in mapped_columns:
                     result.drop(columns=column, inplace=True)
 
+        # if necessary, convert to segmented index and replace NaT
+        is_segmented = index_type(result.index) == define.IndexType.SEGMENTED
+        if (
+                (not is_segmented and as_segmented)
+                or (is_segmented and not allow_nat)
+        ):
+            files_duration = None
+            if self.db is not None:
+                files_duration = self.db._files_duration
+                root = root or self.db.root
+            new_index = utils.to_segmented_index(
+                result.index,
+                allow_nat=allow_nat,
+                files_duration=files_duration,
+                root=root,
+                num_workers=num_workers,
+                verbose=verbose,
+            )
+            result = result.set_axis(new_index)
+
         return result.copy() if (copy and not result_is_copy) else result
 
     def load(

diff --git a/audformat/core/utils.py b/audformat/core/utils.py
@@ -716,6 +716,7 @@ def to_segmented_index(
         obj: typing.Union[pd.Index, pd.Series, pd.DataFrame],
         *,
         allow_nat: bool = True,
+        files_duration: typing.MutableMapping[str, pd.Timedelta] = None,
         root: str = None,
         num_workers: typing.Optional[int] = 1,
         verbose: bool = False,
@@ -739,6 +740,13 @@ def to_segmented_index(
             :ref:`table specifications <data-tables:Tables>`
         allow_nat: if set to ``False``, ``end=NaT`` is replaced with file
             duration
+        files_duration: mapping from file to duration.
+            If not ``None``,
+            used to look up durations.
+            If no entry is found for a file,
+            it is added to the mapping.
+            Expects absolute file names.
+            Only relevant if ``allow_nat`` is set to ``False``
         root: root directory under which the files referenced in the index
             are stored
         num_workers: number of parallel jobs.
@@ -783,16 +791,26 @@ def to_segmented_index(
             files = index.get_level_values(define.IndexField.FILE)
             starts = index.get_level_values(define.IndexField.START)
 
-            def job(file: str) -> float:
+            def job(file: str) -> pd.Timedelta:
+
                 if root is not None and not os.path.isabs(file):
                     file = os.path.join(root, file)
+                if files_duration is not None and file in files_duration:
+                    return files_duration[file]
+
                 if not os.path.exists(file):
                     raise FileNotFoundError(
                         errno.ENOENT,
                         os.strerror(errno.ENOENT),
                         file,
                     )
-                return audiofile.duration(file)
+                dur = audiofile.duration(file)
+                dur = pd.to_timedelta(dur, unit='s')
+
+                if files_duration is not None:
+                    files_duration[file] = dur
+
+                return dur
 
             params = [([file], {}) for file in files[idx_nat]]
             durs = audeer.run_tasks(
@@ -802,7 +820,7 @@ def job(file: str) -> float:
                 progress_bar=verbose,
                 task_description='Read duration',
             )
-            ends.values[idx_nat] = pd.to_timedelta(durs, unit='s')
+            ends.values[idx_nat] = durs
 
             index = segmented_index(files, starts, ends)
 

diff --git a/docs/data-tables.rst b/docs/data-tables.rst
@@ -86,6 +86,12 @@ Access labels as :class:`pandas.Series`
 
     db['filewise']['values'].get()
 
+Access labels with a segmented index:
+
+.. jupyter-execute::
+
+    db['filewise']['values'].get(as_segmented=True)
+
 Create a segmented index:
 
 .. jupyter-execute::

diff --git a/tests/test_column.py b/tests/test_column.py
@@ -115,6 +115,43 @@ def test_filewise(num_files, values):
         column.set([], index=index)
 
 
+def test_get_as_segmented():
+
+    db = pytest.DB
+
+    y = db['files']['bool'].get()
+    assert audformat.index_type(y) == audformat.define.IndexType.FILEWISE
+    assert not db._files_duration
+
+    # convert to segmented index
+
+    y = db['files']['bool'].get(
+        as_segmented=True,
+        allow_nat=True,
+    )
+    assert audformat.index_type(y) == audformat.define.IndexType.SEGMENTED
+    assert not db._files_duration
+    assert y.index.get_level_values(
+        audformat.define.IndexField.END
+    ).isna().all()
+
+    # replace NaT with file duration
+
+    y = db['files']['bool'].get(
+        as_segmented=True,
+        allow_nat=False,
+    )
+    assert audformat.index_type(y) == audformat.define.IndexType.SEGMENTED
+    assert db._files_duration
+    assert not y.index.get_level_values(
+        audformat.define.IndexField.END
+    ).isna().any()
+
+    # reset db
+
+    db._files_duration = {}
+
+
 @pytest.mark.parametrize(
     'column, map',
     [

diff --git a/tests/test_database.py b/tests/test_database.py
@@ -218,6 +218,11 @@ def test_files_duration():
     }
     assert db._files_duration == expected_cache
 
+    # reset db
+
+    db._files_duration = {}
+    db._root = root
+
 
 @pytest.mark.parametrize(
     'license, license_url, expected_license, expected_url',

diff --git a/tests/test_table.py b/tests/test_table.py
@@ -806,6 +806,43 @@ def test_filewise(num_files, values):
     pd.testing.assert_frame_equal(table.get(), df)
 
 
+def test_get_as_segmented():
+
+    db = pytest.DB
+
+    df = db['files'].get()
+    assert audformat.index_type(df) == audformat.define.IndexType.FILEWISE
+    assert not db._files_duration
+
+    # convert to segmented index
+
+    df = db['files'].get(
+        as_segmented=True,
+        allow_nat=True,
+    )
+    assert audformat.index_type(df) == audformat.define.IndexType.SEGMENTED
+    assert not db._files_duration
+    assert df.index.get_level_values(
+        audformat.define.IndexField.END
+    ).isna().all()
+
+    # replace NaT with file duration
+
+    df = db['files'].get(
+        as_segmented=True,
+        allow_nat=False,
+    )
+    assert audformat.index_type(df) == audformat.define.IndexType.SEGMENTED
+    assert db._files_duration
+    assert not df.index.get_level_values(
+        audformat.define.IndexField.END
+    ).isna().any()
+
+    # reset db
+
+    db._files_duration = {}
+
+
 def test_load(tmpdir):
     # Test backward compatibility
     table_file = os.path.join(tmpdir, 'db.table.pkl')