Skip to content

Commit

Permalink
chore(datasets): Remove deprecation warning from Table Dataset (#956)
Browse files Browse the repository at this point in the history
* remove deprecation warning from table dataset

* fix lint

* remove release note

* remove filepath, file_format and load_args

* fix tests

* adjust file permissions

* fix lint

* address PR comments

* fix tests

* restore load_args

* fix lint

* fix tests

* add load_args check

* address PR comments
  • Loading branch information
ravi-kumar-pilla authored Dec 17, 2024
1 parent 9054d99 commit 83dc20f
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 65 deletions.
1 change: 1 addition & 0 deletions kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
## Breaking Changes

- Demoted `video.VideoDataset` from core to experimental dataset.
- Removed file handling capabilities from `ibis.TableDataset`. Use `ibis.FileDataset` to load and save files with an Ibis backend instead.

## Community contributions

Expand Down
44 changes: 6 additions & 38 deletions kedro-datasets/kedro_datasets/ibis/table_dataset.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
"""Provide data loading and saving functionality for Ibis's backends."""
from __future__ import annotations

import warnings
from copy import deepcopy
from typing import TYPE_CHECKING, Any, ClassVar

import ibis.expr.types as ir
from kedro.io import AbstractDataset, DatasetError
from kedro.io import AbstractDataset

from kedro_datasets import KedroDeprecationWarning
from kedro_datasets._utils import ConnectionMixin

if TYPE_CHECKING:
Expand Down Expand Up @@ -76,16 +74,14 @@ class TableDataset(ConnectionMixin, AbstractDataset[ir.Table, ir.Table]):
def __init__( # noqa: PLR0913
self,
*,
filepath: str | None = None,
file_format: str | None = None,
table_name: str | None = None,
table_name: str,
database: str | None = None,
connection: dict[str, Any] | None = None,
load_args: dict[str, Any] | None = None,
save_args: dict[str, Any] | None = None,
metadata: dict[str, Any] | None = None,
) -> None:
"""Creates a new ``TableDataset`` pointing to a table (or file).
"""Creates a new ``TableDataset`` pointing to a table.
``TableDataset`` connects to the Ibis backend object constructed
from the connection configuration. The `backend` key provided in
Expand All @@ -95,8 +91,7 @@ def __init__( # noqa: PLR0913
`ibis.duckdb.connect() <https://ibis-project.org/backends/duckdb\
#ibis.duckdb.connect>`_).
The dataset establishes a connection to the relevant table for the execution
backend. Therefore, Ibis doesn't fetch data on load; all compute
is deferred until materialization, when the expression is saved.
In practice, this happens when another ``TableDataset`` instance
Expand All @@ -122,22 +117,7 @@ def __init__( # noqa: PLR0913
metadata: Any arbitrary metadata. This is ignored by Kedro,
but may be consumed by users or external plugins.
"""
if filepath is None and table_name is None:
raise DatasetError(
"Must provide at least one of `filepath` or `table_name`."
)

if filepath is not None or file_format is not None:
warnings.warn(
"Use 'FileDataset' to load and save files with an Ibis "
"backend; the functionality will be removed from 'Table"
"Dataset' in Kedro-Datasets 6.0.0",
KedroDeprecationWarning,
stacklevel=2,
)

self._filepath = filepath
self._file_format = file_format

self._table_name = table_name
self._database = database
self._connection_config = connection or self.DEFAULT_CONNECTION_CONFIG
Expand Down Expand Up @@ -171,19 +151,9 @@ def connection(self) -> BaseBackend:
return self._connection

def load(self) -> ir.Table:
    """Return a lazy Ibis table expression for the configured table.

    No data is fetched here; Ibis defers all compute until the
    expression is materialized (e.g. by a subsequent ``save``).

    Returns:
        An ``ibis`` table expression bound to ``self._table_name`` on
        the backend connection, with ``self._load_args`` (e.g.
        ``database``) forwarded to ``Backend.table``.
    """
    # File-based loading was removed from this dataset; use
    # ibis.FileDataset for reading files with an Ibis backend.
    return self.connection.table(self._table_name, **self._load_args)

def save(self, data: ir.Table) -> None:
    """Materialize *data* as a table or view on the backend.

    Args:
        data: The Ibis table expression to persist.

    Dispatches to ``Backend.create_table`` or ``Backend.create_view``
    depending on the configured ``self._materialized`` value, passing
    ``self._save_args`` through.
    """
    # self._materialized is either "table" or "view" (from save_args),
    # selecting the corresponding create_* method on the connection.
    writer = getattr(self.connection, f"create_{self._materialized}")
    writer(self._table_name, data, **self._save_args)

Expand All @@ -193,8 +163,6 @@ def _describe(self) -> dict[str, Any]:
load_args.pop("database", None)
save_args.pop("database", None)
return {
"filepath": self._filepath,
"file_format": self._file_format,
"table_name": self._table_name,
"database": self._database,
"backend": self._connection_config["backend"],
Expand Down
33 changes: 6 additions & 27 deletions kedro-datasets/tests/ibis/test_table_dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import duckdb
import ibis
import pytest
from kedro.io import DatasetError
from packaging.version import Version
from pandas.testing import assert_frame_equal

Expand Down Expand Up @@ -48,19 +47,8 @@ def table_dataset(database_name, connection_config, load_args, save_args):


@pytest.fixture
def table_dataset_from_csv(filepath_csv, connection_config, load_args, save_args):
return TableDataset(
filepath=filepath_csv,
file_format="csv",
connection=connection_config,
load_args=load_args,
save_args=save_args,
)


@pytest.fixture
def dummy_table():
    """Return a small in-memory Ibis table used as save/load test data.

    Built with ``ibis.memtable`` so the fixture no longer depends on the
    removed CSV-backed ``table_dataset_from_csv`` fixture.
    """
    return ibis.memtable({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})


@pytest.fixture
Expand Down Expand Up @@ -112,10 +100,11 @@ def test_exists(self, table_dataset, dummy_table):
table_dataset.save(dummy_table)
assert table_dataset.exists()

@pytest.mark.parametrize("load_args", [{"database": "test"}], indirect=True)
def test_load_extra_params(self, table_dataset, load_args):
    """Test overriding the default load arguments."""
    # The dataset no longer supports file reads, so instead of loading a
    # CSV and checking a reader-specific key, verify the overridden
    # load_args are stored on the dataset for use by Backend.table().
    for key, value in load_args.items():
        assert table_dataset._load_args[key] == value

@pytest.mark.parametrize("save_args", [{"materialized": "table"}], indirect=True)
def test_save_extra_params(self, table_dataset, save_args, dummy_table, database):
Expand Down Expand Up @@ -172,16 +161,6 @@ def test_database(
"test" in con.sql("SELECT * FROM duckdb_views").fetchnumpy()["schema_name"]
)

def test_no_filepath_or_table_name(connection_config):
pattern = r"Must provide at least one of `filepath` or `table_name`\."
with pytest.raises(DatasetError, match=pattern):
TableDataset(connection=connection_config)

def test_save_no_table_name(self, table_dataset_from_csv, dummy_table):
pattern = r"Must provide `table_name` for materialization\."
with pytest.raises(DatasetError, match=pattern):
table_dataset_from_csv.save(dummy_table)

@pytest.mark.parametrize(
("connection_config", "key"),
[
Expand Down

0 comments on commit 83dc20f

Please sign in to comment.