diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 518bbdbd8..b48bcce39 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -25,6 +25,7 @@ ## Breaking Changes - Demoted `video.VideoDataset` from core to experimental dataset. +- Removed file handling capabilities from `ibis.TableDataset`. Use `ibis.FileDataset` to load and save files with an Ibis backend instead. ## Community contributions diff --git a/kedro-datasets/kedro_datasets/ibis/table_dataset.py b/kedro-datasets/kedro_datasets/ibis/table_dataset.py index be09c6ddc..646c86791 100644 --- a/kedro-datasets/kedro_datasets/ibis/table_dataset.py +++ b/kedro-datasets/kedro_datasets/ibis/table_dataset.py @@ -1,14 +1,12 @@ """Provide data loading and saving functionality for Ibis's backends.""" from __future__ import annotations -import warnings from copy import deepcopy from typing import TYPE_CHECKING, Any, ClassVar import ibis.expr.types as ir -from kedro.io import AbstractDataset, DatasetError +from kedro.io import AbstractDataset -from kedro_datasets import KedroDeprecationWarning from kedro_datasets._utils import ConnectionMixin if TYPE_CHECKING: @@ -76,16 +74,14 @@ class TableDataset(ConnectionMixin, AbstractDataset[ir.Table, ir.Table]): def __init__( # noqa: PLR0913 self, *, - filepath: str | None = None, - file_format: str | None = None, - table_name: str | None = None, + table_name: str, database: str | None = None, connection: dict[str, Any] | None = None, load_args: dict[str, Any] | None = None, save_args: dict[str, Any] | None = None, metadata: dict[str, Any] | None = None, ) -> None: - """Creates a new ``TableDataset`` pointing to a table (or file). + """Creates a new ``TableDataset`` pointing to a table. ``TableDataset`` connects to the Ibis backend object constructed from the connection configuration. The `backend` key provided in @@ -95,8 +91,7 @@ def __init__( # noqa: PLR0913 `ibis.duckdb.connect() `_). - If ``table_name`` is given (and ``filepath`` isn't), the dataset - establishes a connection to the relevant table for the execution + The dataset establishes a connection to the relevant table for the execution backend. Therefore, Ibis doesn't fetch data on load; all compute is deferred until materialization, when the expression is saved. In practice, this happens when another ``TableDataset`` instance @@ -122,22 +117,7 @@ def __init__( # noqa: PLR0913 metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ - if filepath is None and table_name is None: - raise DatasetError( - "Must provide at least one of `filepath` or `table_name`." - ) - - if filepath is not None or file_format is not None: - warnings.warn( - "Use 'FileDataset' to load and save files with an Ibis " - "backend; the functionality will be removed from 'Table" - "Dataset' in Kedro-Datasets 6.0.0", - KedroDeprecationWarning, - stacklevel=2, - ) - - self._filepath = filepath - self._file_format = file_format + self._table_name = table_name self._database = database self._connection_config = connection or self.DEFAULT_CONNECTION_CONFIG @@ -171,19 +151,9 @@ def connection(self) -> BaseBackend: return self._connection def load(self) -> ir.Table: - if self._filepath is not None: - if self._file_format is None: - raise NotImplementedError - - reader = getattr(self.connection, f"read_{self._file_format}") - return reader(self._filepath, self._table_name, **self._load_args) - else: - return self.connection.table(self._table_name, **self._load_args) + return self.connection.table(self._table_name, **self._load_args) def save(self, data: ir.Table) -> None: - if self._table_name is None: - raise DatasetError("Must provide `table_name` for materialization.") - writer = getattr(self.connection, f"create_{self._materialized}") writer(self._table_name, data, **self._save_args) @@ -193,8 +163,6 @@ def _describe(self) -> dict[str, Any]: load_args.pop("database", None) save_args.pop("database", None) return { - "filepath": self._filepath, - "file_format": self._file_format, "table_name": self._table_name, "database": self._database, "backend": self._connection_config["backend"], diff --git a/kedro-datasets/tests/ibis/test_table_dataset.py b/kedro-datasets/tests/ibis/test_table_dataset.py index 9970caa6a..e3b3828e7 100644 --- a/kedro-datasets/tests/ibis/test_table_dataset.py +++ b/kedro-datasets/tests/ibis/test_table_dataset.py @@ -1,7 +1,6 @@ import duckdb import ibis import pytest -from kedro.io import DatasetError from packaging.version import Version from pandas.testing import assert_frame_equal @@ -48,19 +47,8 @@ def table_dataset(database_name, connection_config, load_args, save_args): @pytest.fixture -def table_dataset_from_csv(filepath_csv, connection_config, load_args, save_args): - return TableDataset( - filepath=filepath_csv, - file_format="csv", - connection=connection_config, - load_args=load_args, - save_args=save_args, - ) - - -@pytest.fixture -def dummy_table(table_dataset_from_csv): - return table_dataset_from_csv.load() +def dummy_table(): + return ibis.memtable({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) @pytest.fixture @@ -112,10 +100,11 @@ def test_exists(self, table_dataset, dummy_table): table_dataset.save(dummy_table) assert table_dataset.exists() - @pytest.mark.parametrize("load_args", [{"filename": True}], indirect=True) - def test_load_extra_params(self, table_dataset_from_csv, load_args): + @pytest.mark.parametrize("load_args", [{"database": "test"}], indirect=True) + def test_load_extra_params(self, table_dataset, load_args): """Test overriding the default load arguments.""" - assert "filename" in table_dataset_from_csv.load() + for key, value in load_args.items(): + assert table_dataset._load_args[key] == value @pytest.mark.parametrize("save_args", [{"materialized": "table"}], indirect=True) def test_save_extra_params(self, table_dataset, save_args, dummy_table, database): @@ -172,16 +161,6 @@ def test_database( "test" in con.sql("SELECT * FROM duckdb_views").fetchnumpy()["schema_name"] ) - def test_no_filepath_or_table_name(connection_config): - pattern = r"Must provide at least one of `filepath` or `table_name`\." - with pytest.raises(DatasetError, match=pattern): - TableDataset(connection=connection_config) - - def test_save_no_table_name(self, table_dataset_from_csv, dummy_table): - pattern = r"Must provide `table_name` for materialization\." - with pytest.raises(DatasetError, match=pattern): - table_dataset_from_csv.save(dummy_table) - @pytest.mark.parametrize( ("connection_config", "key"), [