Commit 9a0a779: Add benchmarks for KedroDataCatalog and fix tests for DataCatalog (#4246)

* Update DataCatalog benchmark tests

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>

* Add tests for KedroDataCatalog first pass

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>

* Add benchmarks for KedroDataCatalog

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>

* Add suggested tests

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>

* Add suggested tests

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>

---------

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>
ankatiyar authored Oct 21, 2024
1 parent 3818a2a commit 9a0a779
Showing 2 changed files with 136 additions and 2 deletions.
8 changes: 6 additions & 2 deletions benchmarks/benchmark_datacatalog.py
@@ -62,11 +62,15 @@ def time_release(self):

     def time_add_all(self):
         """Benchmark the time to add all datasets"""
-        self.catalog.add_all(self.datasets)
+        # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
+        catalog = DataCatalog.from_config(base_catalog)
+        catalog.add_all(self.datasets)

     def time_feed_dict(self):
         """Benchmark the time to add feed dict"""
-        self.catalog.add_feed_dict(self.feed_dict)
+        # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
+        catalog = DataCatalog.from_config(base_catalog)
+        catalog.add_feed_dict(self.feed_dict)

     def time_list(self):
         """Benchmark the time to list all datasets"""
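Note on the change above: benchmark harnesses that use the `setup`/`time_*` convention (such as asv) may call a timed method several times against the same `setup` state, so repeatedly adding the same dataset names to `self.catalog` fails, which is why each call now builds a fresh catalog. A minimal sketch of that failure mode, assuming kedro's public `DataCatalog.add` and `MemoryDataset` APIs (this snippet is illustrative and not part of the commit):

from kedro.io import DataCatalog, DatasetAlreadyExistsError, MemoryDataset

catalog = DataCatalog({"example": MemoryDataset(1)})
try:
    # Same name again, as would happen on a repeated benchmark invocation
    catalog.add("example", MemoryDataset(2))
except DatasetAlreadyExistsError as err:
    print(err)  # the dataset name is already registered in the catalog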
130 changes: 130 additions & 0 deletions benchmarks/benchmark_kedrodatacatalog.py
@@ -0,0 +1,130 @@
import pandas as pd
from kedro_datasets.pandas import CSVDataset

from kedro.io import KedroDataCatalog

base_catalog = {
    f"dataset_{i}": {
        "type": "pandas.CSVDataset",
        "filepath": f"data_{i}.csv",
    } for i in range(1, 1001)
}
# Add datasets with the same filepath for loading
base_catalog.update({
    f"dataset_load_{i}": {
        "type": "pandas.CSVDataset",
        "filepath": "data.csv",
    } for i in range(1, 1001)
})
# Add a factory pattern
base_catalog.update({
    "dataset_factory_{placeholder}": {
        "type": "pandas.CSVDataset",
        "filepath": "data_{placeholder}.csv",
    }
})

runtime_patterns = {
    "{placeholder}": {
        "type": "pandas.CSVDataset",
        "filepath": "{placeholder}.csv",
    }
}


class TimeKedroDataCatalog:
    def setup(self):
        self.catalog = KedroDataCatalog.from_config(base_catalog)
        self.dataframe = pd.DataFrame({"column": [1, 2, 3]})
        self.dataframe.to_csv("data.csv", index=False)
        self.datasets = {
            f"dataset_new_{i}": CSVDataset(filepath="data.csv") for i in range(1, 1001)
        }
        self.feed_dict = {
            f"param_{i}": i for i in range(1, 1001)
        }

    def time_init(self):
        """Benchmark the time to initialize the catalog"""
        KedroDataCatalog.from_config(base_catalog)

    def time_contains(self):
        """Benchmark the time to check if a dataset exists"""
        for i in range(1, 1001):
            f"dataset_{i}" in self.catalog

    def time_getitem(self):
        """Benchmark the time to get a dataset"""
        for i in range(1, 1001):
            self.catalog[f"dataset_{i}"]

    def time_get(self):
        """Benchmark the time to get a dataset"""
        for i in range(1, 1001):
            self.catalog.get(f"dataset_{i}")

    def time_iter(self):
        """Benchmark the time to iterate over the catalog"""
        for dataset in self.catalog:
            pass

    def time_keys(self):
        """Benchmark the time to get the keys of the catalog"""
        self.catalog.keys()

    def time_values(self):
        """Benchmark the time to get the values of the catalog"""
        self.catalog.values()

    def time_items(self):
        """Benchmark the time to get the items of the catalog"""
        self.catalog.items()

    def time_setitem(self):
        """Benchmark the time to set a dataset"""
        for i in range(1, 1001):
            self.catalog[f"dataset_new_{i}"] = CSVDataset(filepath="data.csv")

    def time_setitem_raw(self):
        """Benchmark the time to add a memory dataset"""
        for i in range(1, 1001):
            self.catalog[f"param_{i}"] = self.feed_dict[f"param_{i}"]

    def time_save(self):
        """Benchmark the time to save datasets"""
        for i in range(1, 1001):
            self.catalog.save(f"dataset_{i}", self.dataframe)

    def time_load(self):
        """Benchmark the time to load datasets"""
        for i in range(1, 1001):
            self.catalog.load(f"dataset_load_{i}")

    def time_exists(self):
        """Benchmark the time to check if datasets exist"""
        for i in range(1, 1001):
            self.catalog.exists(f"dataset_{i}")

    def time_release(self):
        """Benchmark the time to release datasets"""
        for i in range(1, 1001):
            self.catalog.release(f"dataset_{i}")

    def time_list(self):
        """Benchmark the time to list all datasets"""
        self.catalog.list()

    def time_shallow_copy(self):
        """Benchmark the time to shallow copy the catalog"""
        # Will be removed
        self.catalog.shallow_copy()

    def time_resolve_factory(self):
        """Benchmark the time to resolve a dataset factory pattern"""
        for i in range(1, 1001):
            self.catalog.get(f"dataset_factory_{i}")

    def time_add_runtime_patterns(self):
        """Benchmark the time to add runtime patterns"""
        for i in range(1, 1001):
            self.catalog.config_resolver.add_runtime_patterns(runtime_patterns)

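For readers skimming the new benchmarks, the `setup`/`time_*` method names follow the asv (airspeed velocity) convention, and most of the timed operations map onto KedroDataCatalog's dict-like interface. The sketch below is illustrative only, not part of the commit: the names and config are made up, and it assumes a kedro version that ships KedroDataCatalog together with kedro-datasets for pandas.CSVDataset.

from kedro.io import KedroDataCatalog, MemoryDataset

config = {
    "cars": {"type": "pandas.CSVDataset", "filepath": "cars.csv"},
    # A dataset factory pattern, resolved when a matching name is requested
    "boats_{placeholder}": {
        "type": "pandas.CSVDataset",
        "filepath": "boats_{placeholder}.csv",
    },
}
catalog = KedroDataCatalog.from_config(config)  # as in time_init

print("cars" in catalog)                  # membership check, as in time_contains
cars = catalog["cars"]                    # __getitem__, as in time_getitem
same = catalog.get("cars")                # as in time_get
catalog["in_memory"] = MemoryDataset(42)  # register a dataset object, as in time_setitem
catalog["a_param"] = 42                   # raw value stored as a memory dataset, as in time_setitem_raw
resolved = catalog.get("boats_2024")      # factory pattern resolution, as in time_resolve_factory
print(catalog.keys())                     # keys/values/items, as in time_keys and friends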