superduper-io · blythed · Jul 9, 2024 · Jul 4, 2024 · Jul 9, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - QueryTemplate component
 - Support for packaging application from the database.
+- Added DataInit component
 
 #### Bug Fixes
 

diff --git a/docs/content/api/components/dataset.md b/docs/content/api/components/dataset.md
@@ -8,14 +8,16 @@
 Dataset(self,
      identifier: str,
      db: dataclasses.InitVar[typing.Optional[ForwardRef('Datalayer')]] = None,
-     uuid: str = <factory>,
+     uuid: None = <factory>,
      *,
+     upstream: "t.Optional[t.List['Component']]" = None,
      artifacts: 'dc.InitVar[t.Optional[t.Dict]]' = None,
      select: 't.Optional[Query]' = None,
      sample_size: 't.Optional[int]' = None,
      random_seed: 't.Optional[int]' = None,
      creation_date: 't.Optional[str]' = None,
-     raw_data: 't.Optional[t.Sequence[t.Any]]' = None) -> None
+     raw_data: 't.Optional[t.Sequence[t.Any]]' = None,
+     pin: 'bool' = False) -> None
 ```
 | Parameter | Description |
 |-----------|-------------|
@@ -28,6 +30,29 @@ Dataset(self,
 | random_seed | The random seed to use for sampling. |
 | creation_date | The date the dataset was created. |
 | raw_data | The raw data for the dataset. |
+| pin | Whether to pin the dataset. If True, the dataset will load the data from the database every time. If False, the dataset will cache the data after it is applied to the db. |
 
 A dataset is an immutable collection of documents.
 
+## `DataInit` 
+
+```python
+DataInit(self,
+     identifier: str,
+     db: dataclasses.InitVar[typing.Optional[ForwardRef('Datalayer')]] = None,
+     uuid: None = <factory>,
+     *,
+     upstream: "t.Optional[t.List['Component']]" = None,
+     artifacts: 'dc.InitVar[t.Optional[t.Dict]]' = None,
+     data: 't.List[t.Dict]',
+     table: 'str') -> None
+```
+| Parameter | Description |
+|-----------|-------------|
+| identifier | Identifier of the leaf. |
+| db | Datalayer instance. |
+| uuid | UUID of the leaf. |
+| artifacts | A dictionary of artifacts paths and `DataType` objects |
+
+A data initialization component: `data` is the list of records to insert and `table` is the table they are inserted into when the component is created.
+
diff --git a/docs/content/apply_api/data_init.md b/docs/content/apply_api/data_init.md
@@ -0,0 +1,17 @@
+# `DataInit`
+
+- `DataInit` is used to automatically insert initialization data during application build.
+
+***Usage pattern***
+
+```python
+from superduperdb.components.dataset import DataInit
+data = [{"x": i, "y": [1, 2, 3]} for i in range(10)]
+data_init = DataInit(data=data, table="documents", identifier="test_data_init")
+
+db.apply(data_init)
+```
+
+***Explanation***
+
+- When `db.apply(data_init)` is executed, `DataInit` inserts `data` into the specified `table`.
diff --git a/superduperdb/components/component.py b/superduperdb/components/component.py
@@ -106,14 +106,15 @@ class Component(Leaf):
     that can be saved into a database.
 
     :param artifacts: A dictionary of artifacts paths and `DataType` objects
+    :param upstream: A list of upstream components
     """
 
     type_id: t.ClassVar[str] = 'component'
     leaf_type: t.ClassVar[str] = 'component'
     _artifacts: t.ClassVar[t.Sequence[t.Tuple[str, 'DataType']]] = ()
     set_post_init: t.ClassVar[t.Sequence] = ('version',)
     changed: t.ClassVar[set] = set([])
-
+    upstream: t.Optional[t.List["Component"]] = None
     artifacts: dc.InitVar[t.Optional[t.Dict]] = None
 
     @property

diff --git a/superduperdb/components/dataset.py b/superduperdb/components/dataset.py
@@ -94,3 +94,25 @@ def __str__(self):
         return f'Dataset(identifier={self.identifier}, select={self.select})'
 
     __repr__ = __str__
+
+
+class DataInit(Component):
+    """A component that inserts initial data into a table when it is created.
+
+    :param data: The list of records (dicts) to insert.
+    :param table: The table to insert the data into.
+    """
+
+    data: t.List[t.Dict]
+    table: str
+
+    def post_create(self, db: Datalayer) -> None:
+        """Insert ``self.data`` into ``self.table`` once the component is created.
+
+        Runs the parent hook and ``self.init()`` before executing the insert.
+
+        :param db: the db that creates the component.
+        """
+        super().post_create(db)
+        self.init()
+        db[self.table].insert(self.data).execute()
diff --git a/test/unittest/component/test_dataset.py b/test/unittest/component/test_dataset.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from superduperdb.components.dataset import Dataset
+from superduperdb.components.dataset import DataInit, Dataset
 
 
 @pytest.mark.parametrize("db", DBConfig.EMPTY_CASES, indirect=True)
@@ -32,3 +32,18 @@ def test_dataset_pin(db, pin):
         len(dataset.data) == 10
     else:
         len(dataset.data) == 20
+
+
+@pytest.mark.parametrize("db", DBConfig.EMPTY_CASES, indirect=True)
+def test_init_data(db):
+    db.cfg.auto_schema = True
+    data = [{"x": i, "y": [1, 2, 3]} for i in range(10)]
+    data_init = DataInit(data=data, table="documents", identifier="test_data_init")
+    # Applying the component is expected to insert `data` into "documents".
+    db.apply(data_init)
+    # Read the table back and check each record against its position.
+    data = list(db["documents"].select().execute())
+    assert len(data) == 10
+    for i, d in enumerate(data):
+        assert d["x"] == i
+        assert d["y"] == [1, 2, 3]