Commit 9a0a779: Add benchmarks for KedroDataCatalog and fix tests for DataCatalog (#4246)

* Update DataCatalog benchmark tests

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>

* Add tests for KedroDataCatalog first pass

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>

* Add benchmarks for KedroDataCatalog

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>

* Add suggested tests

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>

* Add suggested tests

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>

---------

Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com>
ankatiyar authored Oct 21, 2024
1 parent 3818a2a commit 9a0a779
Showing 2 changed files with 136 additions and 2 deletions.
8 changes: 6 additions & 2 deletions benchmarks/benchmark_datacatalog.py
@@ -62,11 +62,15 @@ def time_release(self):

     def time_add_all(self):
         """Benchmark the time to add all datasets"""
-        self.catalog.add_all(self.datasets)
+        # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
+        catalog = DataCatalog.from_config(base_catalog)
+        catalog.add_all(self.datasets)

     def time_feed_dict(self):
         """Benchmark the time to add feed dict"""
-        self.catalog.add_feed_dict(self.feed_dict)
+        # Have to initialise a new DataCatalog to avoid failing with DatasetAlreadyExistsError
+        catalog = DataCatalog.from_config(base_catalog)
+        catalog.add_feed_dict(self.feed_dict)

     def time_list(self):
         """Benchmark the time to list all datasets"""
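Note on the change above: benchmark harnesses that use the `setup`/`time_*` convention (such as asv) may call a timed method several times against the same `setup` state, so repeatedly adding the same dataset names to `self.catalog` fails, which is why each call now builds a fresh catalog. A minimal sketch of that failure mode, assuming kedro's public `DataCatalog.add` and `MemoryDataset` APIs (this snippet is illustrative and not part of the commit):

from kedro.io import DataCatalog, DatasetAlreadyExistsError, MemoryDataset

catalog = DataCatalog({"example": MemoryDataset(1)})
try:
    # Same name again, as would happen on a repeated benchmark invocation
    catalog.add("example", MemoryDataset(2))
except DatasetAlreadyExistsError as err:
    print(err)  # the dataset name is already registered in the catalog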
130 changes: 130 additions & 0 deletions benchmarks/benchmark_kedrodatacatalog.py
@@ -0,0 +1,130 @@
import pandas as pd
from kedro_datasets.pandas import CSVDataset

from kedro.io import KedroDataCatalog

base_catalog = {
    f"dataset_{i}": {
        "type": "pandas.CSVDataset",
        "filepath": f"data_{i}.csv",
    } for i in range(1, 1001)
}
# Add datasets with the same filepath for loading
base_catalog.update({
    f"dataset_load_{i}": {
        "type": "pandas.CSVDataset",
        "filepath": "data.csv",
    } for i in range(1, 1001)
})
# Add a factory pattern
base_catalog.update({
    "dataset_factory_{placeholder}": {
        "type": "pandas.CSVDataset",
        "filepath": "data_{placeholder}.csv",
    }
})

runtime_patterns = {
    "{placeholder}": {
        "type": "pandas.CSVDataset",
        "filepath": "{placeholder}.csv",
    }
}


class TimeKedroDataCatalog:
    def setup(self):
        self.catalog = KedroDataCatalog.from_config(base_catalog)
        self.dataframe = pd.DataFrame({"column": [1, 2, 3]})
        self.dataframe.to_csv("data.csv", index=False)
        self.datasets = {
            f"dataset_new_{i}": CSVDataset(filepath="data.csv") for i in range(1, 1001)
        }
        self.feed_dict = {
            f"param_{i}": i for i in range(1, 1001)
        }

    def time_init(self):
        """Benchmark the time to initialize the catalog"""
        KedroDataCatalog.from_config(base_catalog)

    def time_contains(self):
        """Benchmark the time to check if a dataset exists"""
        for i in range(1, 1001):
            f"dataset_{i}" in self.catalog

    def time_getitem(self):
        """Benchmark the time to get a dataset"""
        for i in range(1, 1001):
            self.catalog[f"dataset_{i}"]

    def time_get(self):
        """Benchmark the time to get a dataset"""
        for i in range(1, 1001):
            self.catalog.get(f"dataset_{i}")

    def time_iter(self):
        """Benchmark the time to iterate over the catalog"""
        for dataset in self.catalog:
            pass

    def time_keys(self):
        """Benchmark the time to get the keys of the catalog"""
        self.catalog.keys()

    def time_values(self):
        """Benchmark the time to get the values of the catalog"""
        self.catalog.values()

    def time_items(self):
        """Benchmark the time to get the items of the catalog"""
        self.catalog.items()

    def time_setitem(self):
        """Benchmark the time to set a dataset"""
        for i in range(1, 1001):
            self.catalog[f"dataset_new_{i}"] = CSVDataset(filepath="data.csv")

    def time_setitem_raw(self):
        """Benchmark the time to add a memory dataset"""
        for i in range(1, 1001):
            self.catalog[f"param_{i}"] = self.feed_dict[f"param_{i}"]

    def time_save(self):
        """Benchmark the time to save datasets"""
        for i in range(1, 1001):
            self.catalog.save(f"dataset_{i}", self.dataframe)

    def time_load(self):
        """Benchmark the time to load datasets"""
        for i in range(1, 1001):
            self.catalog.load(f"dataset_load_{i}")

    def time_exists(self):
        """Benchmark the time to check if datasets exist"""
        for i in range(1, 1001):
            self.catalog.exists(f"dataset_{i}")

    def time_release(self):
        """Benchmark the time to release datasets"""
        for i in range(1, 1001):
            self.catalog.release(f"dataset_{i}")

    def time_list(self):
        """Benchmark the time to list all datasets"""
        self.catalog.list()

    def time_shallow_copy(self):
        """Benchmark the time to shallow copy the catalog"""
        # Will be removed
        self.catalog.shallow_copy()

    def time_resolve_factory(self):
        """Benchmark the time to resolve a dataset factory pattern"""
        for i in range(1, 1001):
            self.catalog.get(f"dataset_factory_{i}")

    def time_add_runtime_patterns(self):
        """Benchmark the time to add runtime patterns"""
        for i in range(1, 1001):
            self.catalog.config_resolver.add_runtime_patterns(runtime_patterns)

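For readers skimming the new benchmarks, the `setup`/`time_*` method names follow the asv (airspeed velocity) convention, and most of the timed operations map onto KedroDataCatalog's dict-like interface. The sketch below is illustrative only, not part of the commit: the names and config are made up, and it assumes a kedro version that ships KedroDataCatalog together with kedro-datasets for pandas.CSVDataset.

from kedro.io import KedroDataCatalog, MemoryDataset

config = {
    "cars": {"type": "pandas.CSVDataset", "filepath": "cars.csv"},
    # A dataset factory pattern, resolved when a matching name is requested
    "boats_{placeholder}": {
        "type": "pandas.CSVDataset",
        "filepath": "boats_{placeholder}.csv",
    },
}
catalog = KedroDataCatalog.from_config(config)  # as in time_init

print("cars" in catalog)                  # membership check, as in time_contains
cars = catalog["cars"]                    # __getitem__, as in time_getitem
same = catalog.get("cars")                # as in time_get
catalog["in_memory"] = MemoryDataset(42)  # register a dataset object, as in time_setitem
catalog["a_param"] = 42                   # raw value stored as a memory dataset, as in time_setitem_raw
resolved = catalog.get("boats_2024")      # factory pattern resolution, as in time_resolve_factory
print(catalog.keys())                     # keys/values/items, as in time_keys and friends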