From 4d54cbb2b78ab0af442be8cb2ca5f1d6c1bb53d2 Mon Sep 17 00:00:00 2001 From: Duncan Blythe Date: Thu, 4 Jul 2024 16:08:57 +0200 Subject: [PATCH 1/2] Add data-init component (cherry picked from commit 4f57a509c747e66e933203bd7f389e16ff045c17) --- superduperdb/components/component.py | 2 +- superduperdb/components/dataset.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/superduperdb/components/component.py b/superduperdb/components/component.py index 2ca2850b4..9ab1ec783 100644 --- a/superduperdb/components/component.py +++ b/superduperdb/components/component.py @@ -113,7 +113,7 @@ class Component(Leaf): _artifacts: t.ClassVar[t.Sequence[t.Tuple[str, 'DataType']]] = () set_post_init: t.ClassVar[t.Sequence] = ('version',) changed: t.ClassVar[set] = set([]) - + upstream: t.Optional[t.List["Component"]] = None artifacts: dc.InitVar[t.Optional[t.Dict]] = None @property diff --git a/superduperdb/components/dataset.py b/superduperdb/components/dataset.py index 57398d22b..7ad147630 100644 --- a/superduperdb/components/dataset.py +++ b/superduperdb/components/dataset.py @@ -94,3 +94,13 @@ def __str__(self): return f'Dataset(identifier={self.identifier}, select={self.select})' __repr__ = __str__ + + +class DataInit(Component): + data: t.List[t.Dict] + table: str + + def post_create(self, db: Datalayer) -> None: + super().post_create(db) + self.init() + db[self.table].insert(self.data).execute() From 09d07141065638effd4d7e11320e2faf866be246 Mon Sep 17 00:00:00 2001 From: JieguangZhou Date: Tue, 9 Jul 2024 15:04:25 +0800 Subject: [PATCH 2/2] Add documentation and UTs for the DataInit component --- CHANGELOG.md | 1 + docs/content/api/components/dataset.md | 29 +++++++++++++++++++++++-- docs/content/apply_api/data_init.md | 17 +++++++++++++++ superduperdb/components/component.py | 1 + superduperdb/components/dataset.py | 14 +++++++++++- test/unittest/component/test_dataset.py | 17 ++++++++++++++- 6 files changed, 75 insertions(+), 4 deletions(-) 
create mode 100644 docs/content/apply_api/data_init.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b2f2a710..c3cfffe37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - QueryTemplate component - Support for packaging application from the database. +- Added DataInit component #### Bug Fixes diff --git a/docs/content/api/components/dataset.md b/docs/content/api/components/dataset.md index e9c7bc590..14e9da2cf 100644 --- a/docs/content/api/components/dataset.md +++ b/docs/content/api/components/dataset.md @@ -8,14 +8,16 @@ Dataset(self, identifier: str, db: dataclasses.InitVar[typing.Optional[ForwardRef('Datalayer')]] = None, - uuid: str = , + uuid: None = , *, + upstream: "t.Optional[t.List['Component']]" = None, artifacts: 'dc.InitVar[t.Optional[t.Dict]]' = None, select: 't.Optional[Query]' = None, sample_size: 't.Optional[int]' = None, random_seed: 't.Optional[int]' = None, creation_date: 't.Optional[str]' = None, - raw_data: 't.Optional[t.Sequence[t.Any]]' = None) -> None + raw_data: 't.Optional[t.Sequence[t.Any]]' = None, + pin: 'bool' = False) -> None ``` | Parameter | Description | |-----------|-------------| @@ -28,6 +30,29 @@ Dataset(self, | random_seed | The random seed to use for sampling. | | creation_date | The date the dataset was created. | | raw_data | The raw data for the dataset. | +| pin | Whether to pin the dataset. If True, the dataset will load the data from the database every time. If False, the dataset will cache the data after it is applied to the db. | A dataset is an immutable collection of documents. 
+## `DataInit` + +```python +DataInit(self, + identifier: str, + db: dataclasses.InitVar[typing.Optional[ForwardRef('Datalayer')]] = None, + uuid: None = , + *, + upstream: "t.Optional[t.List['Component']]" = None, + artifacts: 'dc.InitVar[t.Optional[t.Dict]]' = None, + data: 't.List[t.Dict]', + table: 'str') -> None +``` +| Parameter | Description | +|-----------|-------------| +| identifier | Identifier of the leaf. | +| db | Datalayer instance. | +| uuid | UUID of the leaf. | +| artifacts | A dictionary of artifacts paths and `DataType` objects | + +DataInit(identifier: str, db: dataclasses.InitVar[typing.Optional[ForwardRef('Datalayer')]] = None, uuid: None = , *, upstream: "t.Optional[t.List['Component']]" = None, artifacts: 'dc.InitVar[t.Optional[t.Dict]]' = None, data: 't.List[t.Dict]', table: 'str') + diff --git a/docs/content/apply_api/data_init.md b/docs/content/apply_api/data_init.md new file mode 100644 index 000000000..2f42d1684 --- /dev/null +++ b/docs/content/apply_api/data_init.md @@ -0,0 +1,17 @@ +# `DataInit` + +- Used to automatically insert initialization data during application build. + +***Usage pattern*** + +```python +from superduperdb.components.dataset import DataInit +data = [{"x": i, "y": [1, 2, 3]} for i in range(10)] +data_init = DataInit(data=data, table="documents", identifier="test_data_init") + +db.apply(data_init) +``` + +***Explanation*** + +- When db.apply(data_init) is executed, DataInit inserts data into the specified table. \ No newline at end of file diff --git a/superduperdb/components/component.py b/superduperdb/components/component.py index 9ab1ec783..70e8ce7a7 100644 --- a/superduperdb/components/component.py +++ b/superduperdb/components/component.py @@ -106,6 +106,7 @@ class Component(Leaf): that can be saved into a database. 
:param artifacts: A dictionary of artifacts paths and `DataType` objects + :param upstream: A list of upstream components """ type_id: t.ClassVar[str] = 'component' diff --git a/superduperdb/components/dataset.py b/superduperdb/components/dataset.py index 7ad147630..6c116a144 100644 --- a/superduperdb/components/dataset.py +++ b/superduperdb/components/dataset.py @@ -97,10 +97,22 @@ def __str__(self): class DataInit(Component): + """A data initialization component. + + :param data: The data to initialize. + :param table: The table to insert the data. + """ + data: t.List[t.Dict] table: str def post_create(self, db: Datalayer) -> None: - super().post_create(db) + """Called after the first time this component is created. + + Generally used if ``self.version`` is important in this logic. + + :param db: the db that creates the component. + """ + super().post_create(db) self.init() db[self.table].insert(self.data).execute() diff --git a/test/unittest/component/test_dataset.py b/test/unittest/component/test_dataset.py index 7fce2dfb6..284afcf64 100644 --- a/test/unittest/component/test_dataset.py +++ b/test/unittest/component/test_dataset.py @@ -2,7 +2,7 @@ import pytest -from superduperdb.components.dataset import Dataset +from superduperdb.components.dataset import DataInit, Dataset @pytest.mark.parametrize("db", DBConfig.EMPTY_CASES, indirect=True) @@ -32,3 +32,18 @@ def test_dataset_pin(db, pin): len(dataset.data) == 10 else: len(dataset.data) == 20 + + +@pytest.mark.parametrize("db", DBConfig.EMPTY_CASES, indirect=True) +def test_init_data(db): + db.cfg.auto_schema = True + data = [{"x": i, "y": [1, 2, 3]} for i in range(10)] + data_init = DataInit(data=data, table="documents", identifier="test_data_init") + + db.apply(data_init) + + data = list(db["documents"].select().execute()) + assert len(data) == 10 + for i, d in enumerate(data): + assert d["x"] == i + assert d["y"] == [1, 2, 3]