Skip to content

Commit

Permalink
Get table or column data with a segmented index (#116)
Browse files Browse the repository at this point in the history
* utils.to_segmented_index(): add files_duration

* Table.get(): add as_segmented

* Column.get(): add as_segmented

* TST: update

* DOC: update

* DOC: fix docstring

* DOC: fix docstring

* DOC: update usage example

* Update audformat/core/table.py

Co-authored-by: Hagen Wierstorf <hwierstorf@audeering.com>

* DOC: update docstring

* TST: update

Co-authored-by: Hagen Wierstorf <hwierstorf@audeering.com>
  • Loading branch information
frankenjoe and hagenw authored Nov 10, 2021
1 parent ea1800c commit e7b9d75
Show file tree
Hide file tree
Showing 8 changed files with 253 additions and 7 deletions.
34 changes: 33 additions & 1 deletion audformat/core/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,11 @@ def get(
*,
map: str = None,
copy: bool = True,
as_segmented: bool = False,
allow_nat: bool = True,
root: str = None,
num_workers: typing.Optional[int] = 1,
verbose: bool = False,
) -> pd.Series:
r"""Get labels.
Expand All @@ -149,11 +154,30 @@ def get(
assigned to a scheme that contains a dict mapping
speaker IDs to age entries, ``map='age'``
will replace the ID values with the age of the speaker
as_segmented: if set to ``True``
and column has a filewise index,
the index of the returned column
will be converted to a segmented index.
``start`` will be set to ``0`` and
``end`` to ``NaT`` or to the file duration
if ``allow_nat`` is set to ``False``
allow_nat: if set to ``False``,
``end=NaT`` is replaced with file duration
root: root directory under which the files are stored.
Provide if file names are relative and
database was not saved or loaded from disk.
If ``None`` :attr:`audformat.Database.root` is used.
Only relevant if ``allow_nat`` is set to ``False``
num_workers: number of parallel jobs.
If ``None`` will be set to the number of processors
on the machine multiplied by 5
verbose: show progress bar
Returns:
labels
Raises:
FileNotFoundError: if file is not found
RuntimeError: if column is not assigned to a table
ValueError: if trying to map without a scheme
ValueError: if trying to map from a scheme that has no labels
Expand All @@ -165,7 +189,15 @@ def get(
'Column is not assigned to a table.'
)

result = self._table.get(index, copy=False)
result = self._table.get(
index,
copy=False,
as_segmented=as_segmented,
allow_nat=allow_nat,
root=root,
num_workers=num_workers,
verbose=verbose,
)
result = result[self._id]

if map is not None:
Expand Down
50 changes: 50 additions & 0 deletions audformat/core/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ class Table(HeaderBase):
file
f1 0
f2 1
>>> table.get(as_segmented=True)
values
file start end
f1 0 days NaT 0
f2 0 days NaT 1
f3 0 days NaT 2
>>> index_ex = filewise_index('f4')
>>> table_ex = table.extend_index(
... index_ex,
Expand Down Expand Up @@ -465,6 +471,11 @@ def get(
str, typing.Union[str, typing.Sequence[str]]
] = None,
copy: bool = True,
as_segmented: bool = False,
allow_nat: bool = True,
root: str = None,
num_workers: typing.Optional[int] = 1,
verbose: bool = False,
) -> pd.DataFrame:
r"""Get labels.
Expand All @@ -487,11 +498,30 @@ def get(
values to age and gender, respectively.
To also keep the original column with speaker IDs, you can do
``map={'speaker': ['speaker', 'age', 'gender']}``
as_segmented: if set to ``True``
and table has a filewise index,
the index of the returned table
will be converted to a segmented index.
``start`` will be set to ``0`` and
``end`` to ``NaT`` or to the file duration
if ``allow_nat`` is set to ``False``
allow_nat: if set to ``False``,
``end=NaT`` is replaced with file duration
root: root directory under which the files are stored.
Provide if file names are relative and
database was not saved or loaded from disk.
If ``None`` :attr:`audformat.Database.root` is used.
Only relevant if ``allow_nat`` is set to ``False``
num_workers: number of parallel jobs.
If ``None`` will be set to the number of processors
on the machine multiplied by 5
verbose: show progress bar
Returns:
labels
Raises:
FileNotFoundError: if file is not found
RuntimeError: if table is not assigned to a database
ValueError: if trying to map without a scheme
ValueError: if trying to map from a scheme that has no labels
Expand Down Expand Up @@ -545,6 +575,26 @@ def get(
if column not in mapped_columns:
result.drop(columns=column, inplace=True)

# if necessary, convert to segmented index and replace NaT
is_segmented = index_type(result.index) == define.IndexType.SEGMENTED
if (
(not is_segmented and as_segmented)
or (is_segmented and not allow_nat)
):
files_duration = None
if self.db is not None:
files_duration = self.db._files_duration
root = root or self.db.root
new_index = utils.to_segmented_index(
result.index,
allow_nat=allow_nat,
files_duration=files_duration,
root=root,
num_workers=num_workers,
verbose=verbose,
)
result = result.set_axis(new_index)

return result.copy() if (copy and not result_is_copy) else result

def load(
Expand Down
24 changes: 21 additions & 3 deletions audformat/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,7 @@ def to_segmented_index(
obj: typing.Union[pd.Index, pd.Series, pd.DataFrame],
*,
allow_nat: bool = True,
files_duration: typing.MutableMapping[str, pd.Timedelta] = None,
root: str = None,
num_workers: typing.Optional[int] = 1,
verbose: bool = False,
Expand All @@ -739,6 +740,13 @@ def to_segmented_index(
:ref:`table specifications <data-tables:Tables>`
allow_nat: if set to ``False``, ``end=NaT`` is replaced with file
duration
files_duration: mapping from file to duration.
If not ``None``,
used to look up durations.
If no entry is found for a file,
it is added to the mapping.
Expects absolute file names.
Only relevant if ``allow_nat`` is set to ``False``
root: root directory under which the files referenced in the index
are stored
num_workers: number of parallel jobs.
Expand Down Expand Up @@ -783,16 +791,26 @@ def to_segmented_index(
files = index.get_level_values(define.IndexField.FILE)
starts = index.get_level_values(define.IndexField.START)

def job(file: str) -> float:
def job(file: str) -> pd.Timedelta:

if root is not None and not os.path.isabs(file):
file = os.path.join(root, file)
if files_duration is not None and file in files_duration:
return files_duration[file]

if not os.path.exists(file):
raise FileNotFoundError(
errno.ENOENT,
os.strerror(errno.ENOENT),
file,
)
return audiofile.duration(file)
dur = audiofile.duration(file)
dur = pd.to_timedelta(dur, unit='s')

if files_duration is not None:
files_duration[file] = dur

return dur

params = [([file], {}) for file in files[idx_nat]]
durs = audeer.run_tasks(
Expand All @@ -802,7 +820,7 @@ def job(file: str) -> float:
progress_bar=verbose,
task_description='Read duration',
)
ends.values[idx_nat] = pd.to_timedelta(durs, unit='s')
ends.values[idx_nat] = durs

index = segmented_index(files, starts, ends)

Expand Down
6 changes: 6 additions & 0 deletions docs/data-tables.rst
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ Access labels as :class:`pandas.Series`

db['filewise']['values'].get()

Access labels with a segmented index:

.. jupyter-execute::

db['filewise']['values'].get(as_segmented=True)

Create a segmented index:

.. jupyter-execute::
Expand Down
37 changes: 37 additions & 0 deletions tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,43 @@ def test_filewise(num_files, values):
column.set([], index=index)


def test_get_as_segmented():
    r"""Check ``get()`` on a column with ``as_segmented``.

    Uses the shared ``pytest.DB`` database
    and its ``db['files']['bool']`` column.
    The test verifies three things:

    * a plain ``get()`` returns a filewise index
      and leaves ``db._files_duration`` empty
    * ``as_segmented=True, allow_nat=True`` returns a segmented index
      whose ``end`` values are all ``NaT``
      and still leaves ``db._files_duration`` empty
    * ``as_segmented=True, allow_nat=False`` returns a segmented index
      without ``NaT`` in ``end``
      and fills ``db._files_duration``

    The duration cache is reset in a ``finally`` clause,
    so a failing assertion cannot leave the shared database
    in a modified state for tests that run afterwards.

    """
    db = pytest.DB

    try:

        y = db['files']['bool'].get()
        assert audformat.index_type(y) == audformat.define.IndexType.FILEWISE
        assert not db._files_duration

        # convert to segmented index

        y = db['files']['bool'].get(
            as_segmented=True,
            allow_nat=True,
        )
        assert audformat.index_type(y) == audformat.define.IndexType.SEGMENTED
        assert not db._files_duration
        assert y.index.get_level_values(
            audformat.define.IndexField.END
        ).isna().all()

        # replace NaT with file duration

        y = db['files']['bool'].get(
            as_segmented=True,
            allow_nat=False,
        )
        assert audformat.index_type(y) == audformat.define.IndexType.SEGMENTED
        assert db._files_duration
        assert not y.index.get_level_values(
            audformat.define.IndexField.END
        ).isna().any()

    finally:

        # reset db, even if an assertion above failed

        db._files_duration = {}


@pytest.mark.parametrize(
'column, map',
[
Expand Down
5 changes: 5 additions & 0 deletions tests/test_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,11 @@ def test_files_duration():
}
assert db._files_duration == expected_cache

# reset db

db._files_duration = {}
db._root = root


@pytest.mark.parametrize(
'license, license_url, expected_license, expected_url',
Expand Down
37 changes: 37 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,43 @@ def test_filewise(num_files, values):
pd.testing.assert_frame_equal(table.get(), df)


def test_get_as_segmented():
    r"""Check ``get()`` on a table with ``as_segmented``.

    Uses the shared ``pytest.DB`` database
    and its ``db['files']`` table.
    The test verifies three things:

    * a plain ``get()`` returns a filewise index
      and leaves ``db._files_duration`` empty
    * ``as_segmented=True, allow_nat=True`` returns a segmented index
      whose ``end`` values are all ``NaT``
      and still leaves ``db._files_duration`` empty
    * ``as_segmented=True, allow_nat=False`` returns a segmented index
      without ``NaT`` in ``end``
      and fills ``db._files_duration``

    The duration cache is reset in a ``finally`` clause,
    so a failing assertion cannot leave the shared database
    in a modified state for tests that run afterwards.

    """
    db = pytest.DB

    try:

        df = db['files'].get()
        assert audformat.index_type(df) == audformat.define.IndexType.FILEWISE
        assert not db._files_duration

        # convert to segmented index

        df = db['files'].get(
            as_segmented=True,
            allow_nat=True,
        )
        assert audformat.index_type(df) == audformat.define.IndexType.SEGMENTED
        assert not db._files_duration
        assert df.index.get_level_values(
            audformat.define.IndexField.END
        ).isna().all()

        # replace NaT with file duration

        df = db['files'].get(
            as_segmented=True,
            allow_nat=False,
        )
        assert audformat.index_type(df) == audformat.define.IndexType.SEGMENTED
        assert db._files_duration
        assert not df.index.get_level_values(
            audformat.define.IndexField.END
        ).isna().any()

    finally:

        # reset db, even if an assertion above failed

        db._files_duration = {}


def test_load(tmpdir):
# Test backward compatibility
table_file = os.path.join(tmpdir, 'db.table.pkl')
Expand Down
Loading

0 comments on commit e7b9d75

Please sign in to comment.