From 46bb394a500e913cf3fe8ede329deea023c78eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Tue, 11 Apr 2023 12:05:04 +0200 Subject: [PATCH 01/96] Fix links on GitHub issue templates (#150) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- .github/ISSUE_TEMPLATE/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index af7ecdbe0..53557f844 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,10 +1,10 @@ blank_issues_enabled: false contact_links: - - name: Discord server + - name: Slack workspace about: Come chat with the community! - url: https://discord.gg/akJDeVaxnB + url: https://slack.kedro.org - name: Documentation - url: https://kedro.readthedocs.io/en/stable/ + url: https://docs.kedro.org about: To learn more about how Kedro works - name: Case studies, articles and video tutorials url: https://github.com/kedro-org/kedro-community From c9421aed8f642fa4562b1069ffe00fa08ca9f183 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Wed, 12 Apr 2023 13:41:49 +0100 Subject: [PATCH 02/96] add spark_stream_dataset.py Signed-off-by: Tingting_Wan --- .../spark/spark_stream_dataset.py | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py new file mode 100644 index 000000000..6844e04cf --- /dev/null +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -0,0 +1,128 @@ +"""SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +from typing import Any, Dict + +import pyspark +import yaml +from kedro.io import AbstractDataSet +from pyspark import SparkConf +from pyspark.sql import SparkSession +from yaml.loader import SafeLoader + + +class SparkStreamingDataSet(AbstractDataSet): + """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. + + Example usage for the + `YAML API `_: + .. code-block:: yaml + + raw.new_inventory: + type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + filepath: data/01_raw/stream/inventory/ + file_format: json + + int.new_inventory: + type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + filepath: data/02_intermediate/inventory/ + file_format: csv + save_args: + output_mode: append + checkpoint: data/04_checkpoint/int_new_inventory + header: True + load_args: + header: True + + """ + + def __init__( + self, + filepath: str = "", + file_format: str = "", + save_args: Dict[str, str] = {}, + load_args: Dict[str, str] = {}, + ): + """Creates a new instance of SparkStreamingDataSet. + + Args: + filepath: Filepath in POSIX format to a Spark dataframe. When using Databricks + specify ``filepath``s starting with ``/dbfs/``. For message brokers such as + Kafka and all filepath is not required. + file_format: File format used during load and save + operations. These are formats supported by the running + SparkContext include parquet, csv, delta. For a list of supported + formats please refer to Apache Spark documentation at + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + load_args: Load args passed to Spark DataFrameReader load method. 
+ It is dependent on the selected file format. You can find + a list of read options for each supported format + in Spark DataFrame read documentation: + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + save_args: Save args passed to Spark DataFrame write options. + Similar to load_args this is dependent on the selected file + format. You can pass ``mode`` and ``partitionBy`` to specify + your overwrite mode and partitioning respectively. You can find + a list of options for each format in Spark DataFrame + write documentation: + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + """ + self._filepath_ = filepath + self.file_format = file_format + self._save_args = save_args + self._load_args = load_args + self.output_format = [ + "kafka" + ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + + # read spark configuration from spark yml file and create a spark context + with open("conf/base/spark.yml") as f: + self.parameters = yaml.load(f, Loader=SafeLoader) + self.spark_conf = SparkConf().setAll(self.parameters.items()) + + # Initialise the spark session + self.spark_session_conf = SparkSession.builder.config(conf=self.spark_conf) + self.spark = self.spark_session_conf.getOrCreate() + + def _load(self) -> pyspark.sql.DataFrame: + """Loads data from filepath. + If the connector type is kafka then no file_path is required + + Returns: + Data from filepath as pyspark dataframe. + """ + input_constructor = self.spark.readStream.format(self.file_format).options( + **self._load_args + ) + return ( + input_constructor.load() + if self.file_format + in self.output_format # if the connector type is message broker + else input_constructor.load(self._filepath_) + ) + + def _save(self, data: pyspark.sql.DataFrame) -> None: + """Saves pyspark dataframe. + + Args: + data: PySpark streaming dataframe for saving + + """ + + output_constructor = data.writeStream.format(self.file_format) + + # for message brokers path is not needed + if self.file_format not in self.output_format: + output_constructor = output_constructor.option("path", self._filepath_) + + ( + output_constructor.option( + "checkpointLocation", self._save_args.pop("checkpoint") + ) + .outputMode(self._save_args.pop("output_mode")) + .options(**self._save_args) + .start() + ) + + def _describe(self) -> Dict[str, Any]: + """Returns a dict that describes attributes of the dataset.""" + return None From 63f578aceee5aec43b8dde0e60a5c16c49ed2f32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Wed, 12 Apr 2023 14:22:14 +0200 Subject: [PATCH 03/96] Migrate most of `kedro-datasets` metadata to `pyproject.toml` (#161) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Include missing requirements files in sdist Fix gh-86. Signed-off-by: Juan Luis Cano Rodríguez * Migrate most project metadata to `pyproject.toml` See https://github.com/kedro-org/kedro/issues/2334. 
Signed-off-by: Juan Luis Cano Rodríguez * Move requirements to `pyproject.toml` Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- Makefile | 2 +- kedro-datasets/pyproject.toml | 25 +++++++++++++++++++++++ kedro-datasets/requirements.txt | 1 - kedro-datasets/setup.py | 35 +-------------------------------- 4 files changed, 27 insertions(+), 36 deletions(-) delete mode 100644 kedro-datasets/requirements.txt diff --git a/Makefile b/Makefile index 86daa6313..be653ed59 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ package: cd $(plugin);\ rm -Rf dist;\ - python setup.py sdist bdist_wheel + python -m build pypi: python -m pip install twine -U diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 6df7bd372..0f0ad2fc3 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -1,3 +1,28 @@ +[project] +name = "kedro-datasets" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Datasets is where you can find all of Kedro's data connectors." +requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "kedro~=0.18.4", +] +dynamic = ["readme", "version", "optional-dependencies"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets" +Documentation = "https://docs.kedro.org" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[tool.setuptools.packages.find] +include = ["kedro_datasets*"] + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_datasets.__version__"} + [tool.black] [tool.isort] diff --git a/kedro-datasets/requirements.txt b/kedro-datasets/requirements.txt deleted file mode 100644 index b5edbb617..000000000 --- a/kedro-datasets/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -kedro~=0.18.4 diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 6d88fe50e..4840b8535 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -1,12 +1,6 @@ -import re -from codecs import open from itertools import chain -from os import path -from setuptools import find_packages, setup - -name = "kedro-datasets" -here = path.abspath(path.dirname(__file__)) +from setuptools import setup # at least 1.3 to be able to use XMLDataSet and pandas integration with fsspec PANDAS = "pandas>=1.3, <3.0" @@ -15,21 +9,6 @@ S3FS = "s3fs>=0.3.0, <0.5" POLARS = "polars~=0.15.16" -with open("requirements.txt", "r", encoding="utf-8") as f: - install_requires = [x.strip() for x in f if x.strip()] - -with open("test_requirements.txt", "r", encoding="utf-8") as f: - tests_require = [x.strip() for x in f if x.strip() and not x.startswith("-r")] - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# Get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - def _collect_requirements(requires): return sorted(set(chain.from_iterable(requires.values()))) @@ -145,17 +124,5 @@ def _collect_requirements(requires): extras_require["all"] = _collect_requirements(extras_require) setup( - name=name, - version=version, - description="Kedro-Datasets is where you can find all of Kedro's data connectors.", - long_description=readme, - long_description_content_type="text/markdown", - 
url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets", - install_requires=install_requires, - tests_require=tests_require, - author="Kedro", - python_requires=">=3.7, <3.11", - license="Apache Software License (Apache 2.0)", - packages=find_packages(exclude=["tests*"]), extras_require=extras_require, ) From 4b387ff1a5e44ebded6d70df640ab600999c135b Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 11:48:57 +0100 Subject: [PATCH 04/96] restructure the strean dataset to align with the other spark dataset Signed-off-by: Tingting_Wan --- .../spark/spark_stream_dataset.py | 57 ++++++++++++++----- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 6844e04cf..0992ab5ce 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -1,13 +1,13 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" from typing import Any, Dict - -import pyspark +from copy import deepcopy import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf -from pyspark.sql import SparkSession +from pathlib import PurePosixPath +from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader - +from kedro_datasets.spark.spark_dataset import _split_filepath class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. @@ -35,13 +35,16 @@ class SparkStreamingDataSet(AbstractDataSet): """ + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + def __init__( self, filepath: str = "", file_format: str = "", - save_args: Dict[str, str] = {}, - load_args: Dict[str, str] = {}, - ): + save_args: Dict[str, Any] = None, + load_args: Dict[str, Any] = None, + ) -> None: """Creates a new instance of SparkStreamingDataSet. Args: @@ -74,23 +77,46 @@ def __init__( "kafka" ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + fs_prefix, filepath = _split_filepath(filepath) + + self._fs_prefix = fs_prefix + self._filepath = PurePosixPath(filepath) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + def _describe(self) -> Dict[str, Any]: + """Returns a dict that describes attributes of the dataset.""" + return { + "filepath": self._fs_prefix + str(self._filepath), + "file_format": self._file_format, + "load_args": self._load_args, + "save_args": self._save_args, + } + + @staticmethod + def _get_spark(self): # read spark configuration from spark yml file and create a spark context with open("conf/base/spark.yml") as f: self.parameters = yaml.load(f, Loader=SafeLoader) self.spark_conf = SparkConf().setAll(self.parameters.items()) # Initialise the spark session - self.spark_session_conf = SparkSession.builder.config(conf=self.spark_conf) - self.spark = self.spark_session_conf.getOrCreate() + return SparkSession.builder.config(conf=self.spark_conf).getOrCreate() - def _load(self) -> pyspark.sql.DataFrame: + def _load(self) -> DataFrame: """Loads data from filepath. 
If the connector type is kafka then no file_path is required Returns: Data from filepath as pyspark dataframe. """ - input_constructor = self.spark.readStream.format(self.file_format).options( + input_constructor = self._get_spark().readStream.format(self.file_format).options( **self._load_args ) return ( @@ -100,7 +126,7 @@ def _load(self) -> pyspark.sql.DataFrame: else input_constructor.load(self._filepath_) ) - def _save(self, data: pyspark.sql.DataFrame) -> None: + def _save(self, data: DataFrame) -> None: """Saves pyspark dataframe. Args: @@ -123,6 +149,7 @@ def _save(self, data: pyspark.sql.DataFrame) -> None: .start() ) - def _describe(self) -> Dict[str, Any]: - """Returns a dict that describes attributes of the dataset.""" - return None + + + + From 39ad9fd56f116d421b85d0a6e4f5a4b0eface6a1 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 17:43:56 +0100 Subject: [PATCH 05/96] adding README.md for specification Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 38 +++++++++++++++++++ .../spark/spark_stream_dataset.py | 17 +++++---- 2 files changed, 48 insertions(+), 7 deletions(-) create mode 100644 kedro-datasets/kedro_datasets/spark/README.md diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md new file mode 100644 index 000000000..bded31532 --- /dev/null +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -0,0 +1,38 @@ +# Spark Streaming + +``SparkStreamingDatasets`` loads and saves data to spark streaming DatafFrames. +See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. + +To work with multiple streaming nodes, 2 hook are required for: + - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details + - Running streaming query without termination unless exception + +#### Example SparkStreamsHook: + +```python +from kedro.framework.hooks import hook_impl +from pyspark.sql import SparkSession + +class SparkStreamsHook: + @hook_impl + def after_pipeline_run(self) -> None: + """Starts a spark streaming await session + once the pipeline reaches the last node + """ + + spark = SparkSession.builder.getOrCreate() + spark.streams.awaitAnyTermination() +``` +To make the application work with kafka format, respective spark configuration need to be added in ``conf/base/spark.yml``. 
+ +#### Example spark.yml: + +```yaml +spark.driver.maxResultSize: 3g +spark.scheduler.mode: FAIR +spark.sql.streaming.schemaInference: True +spark.streaming.stopGracefullyOnShutdown: true # graceful shutdown guarantees (under some conditions, listed below in the post) that all received data is processed before destroying Spark context +spark.sql.streaming.stateStore.stateSchemaCheck: false # since schema is not mentioned explicitly +spark.jars.packages: org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1 # spark and kafka configuraton for reading kafka files (not required if kafka is not used) + +``` diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 0992ab5ce..382c45286 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -1,4 +1,5 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +import os from typing import Any, Dict from copy import deepcopy import yaml @@ -101,13 +102,15 @@ def _describe(self) -> Dict[str, Any]: @staticmethod def _get_spark(self): - # read spark configuration from spark yml file and create a spark context - with open("conf/base/spark.yml") as f: - self.parameters = yaml.load(f, Loader=SafeLoader) - self.spark_conf = SparkConf().setAll(self.parameters.items()) - - # Initialise the spark session - return SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + spark_conf_path = "conf/base/spark.yml" + if os.path.exists(spark_conf_path): + with open(spark_conf_path) as f: + self.parameters = yaml.load(f, Loader=SafeLoader) + self.spark_conf = SparkConf().setAll(self.parameters.items()) + spark = SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + else: + spark = SparkSession.builder.getOrCreate() + return spark def _load(self) -> DataFrame: """Loads data from filepath. From 69eb8bea26b750fb9ffeca48ee1220e372547826 Mon Sep 17 00:00:00 2001 From: Tingting Wan <110382691+Tingting711@users.noreply.github.com> Date: Fri, 14 Apr 2023 01:40:10 +0800 Subject: [PATCH 06/96] Update kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py Co-authored-by: Nok Lam Chan Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 382c45286..77bf62f40 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -19,7 +19,7 @@ class SparkStreamingDataSet(AbstractDataSet): .. 
code-block:: yaml raw.new_inventory: - type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + type: spark.SparkStreamingDataSet filepath: data/01_raw/stream/inventory/ file_format: json From 3106068ce458d81849df4b513662d0088d3a860b Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 18:41:12 +0100 Subject: [PATCH 07/96] rename the dataset Signed-off-by: Tingting_Wan --- .../spark/{spark_stream_dataset.py => spark_streaming_dataset.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kedro-datasets/kedro_datasets/spark/{spark_stream_dataset.py => spark_streaming_dataset.py} (100%) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py similarity index 100% rename from kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py rename to kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py From b8141a7eef96e140e5abd19e7d9a6a16da0f6a47 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 17 Apr 2023 10:19:43 +0100 Subject: [PATCH 08/96] resolve comments Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/__init__.py | 2 ++ kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index 3dede09aa..c93d3f0df 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -12,3 +12,5 @@ from .spark_jdbc_dataset import SparkJDBCDataSet with suppress(ImportError): from .deltatable_dataset import DeltaTableDataSet +with suppress(ImportError): + from .spark_streaming_dataset import SparkStreamingDataSet diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 77bf62f40..1ee271e87 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -24,7 +24,7 @@ class SparkStreamingDataSet(AbstractDataSet): file_format: json int.new_inventory: - type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + type: spark.SparkStreamingDataSet filepath: data/02_intermediate/inventory/ file_format: csv save_args: From 738625e63a48365dc05fe1905cc08db9c40a4aa6 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 17 Apr 2023 11:42:05 +0100 Subject: [PATCH 09/96] fix format and pylint Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 1ee271e87..fa6fc9c7e 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -2,14 +2,15 @@ import os from typing import Any, Dict from copy import deepcopy +from pathlib import PurePosixPath import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf -from pathlib import PurePosixPath from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader from kedro_datasets.spark.spark_dataset import _split_filepath + class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. 
@@ -71,12 +72,10 @@ def __init__( https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html """ self._filepath_ = filepath - self.file_format = file_format + self._file_format = file_format self._save_args = save_args self._load_args = load_args - self.output_format = [ - "kafka" - ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + self.output_format = ["kafka"] fs_prefix, filepath = _split_filepath(filepath) @@ -101,13 +100,15 @@ def _describe(self) -> Dict[str, Any]: } @staticmethod - def _get_spark(self): + def _get_spark(): spark_conf_path = "conf/base/spark.yml" if os.path.exists(spark_conf_path): - with open(spark_conf_path) as f: - self.parameters = yaml.load(f, Loader=SafeLoader) - self.spark_conf = SparkConf().setAll(self.parameters.items()) - spark = SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + with open( + spark_conf_path, encoding="utf-8" + ) as File: # pylint: disable=invalid-name + parameters = yaml.load(File, Loader=SafeLoader) + spark_conf = SparkConf().setAll(parameters.items()) + spark = SparkSession.builder.config(conf=spark_conf).getOrCreate() else: spark = SparkSession.builder.getOrCreate() return spark @@ -119,12 +120,14 @@ def _load(self) -> DataFrame: Returns: Data from filepath as pyspark dataframe. """ - input_constructor = self._get_spark().readStream.format(self.file_format).options( - **self._load_args + input_constructor = ( + self._get_spark() + .readStream.format(self._file_format) + .options(**self._load_args) ) return ( input_constructor.load() - if self.file_format + if self._file_format in self.output_format # if the connector type is message broker else input_constructor.load(self._filepath_) ) @@ -137,10 +140,10 @@ def _save(self, data: DataFrame) -> None: """ - output_constructor = data.writeStream.format(self.file_format) + output_constructor = data.writeStream.format(self._file_format) # for message brokers path is not needed - if self.file_format not in self.output_format: + if self._file_format not in self.output_format: output_constructor = output_constructor.option("path", self._filepath_) ( @@ -151,8 +154,3 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - - - - - From a54cc676df0a957c536994084769bc6e72244417 Mon Sep 17 00:00:00 2001 From: Tingting Wan <110382691+Tingting711@users.noreply.github.com> Date: Mon, 17 Apr 2023 21:21:08 +0800 Subject: [PATCH 10/96] Update kedro-datasets/kedro_datasets/spark/README.md Co-authored-by: Deepyaman Datta Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index bded31532..f222df00a 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -1,6 +1,6 @@ # Spark Streaming -``SparkStreamingDatasets`` loads and saves data to spark streaming DatafFrames. +``SparkStreamingDataSet`` loads and saves data to streaming DataFrames. See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. 
To work with multiple streaming nodes, 2 hook are required for: From b924ad6e6fc58c7bf8e4556163787350c9d3da80 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Fri, 21 Apr 2023 23:02:52 +0100 Subject: [PATCH 11/96] add unit tests and SparkStreamingDataset in init.py Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/__init__.py | 2 +- .../spark/spark_streaming_dataset.py | 14 +++++- .../spark/test_spark_streaming_dataset.py | 47 +++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 kedro-datasets/tests/spark/test_spark_streaming_dataset.py diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index c93d3f0df..0c46a7fc3 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -1,6 +1,6 @@ """Provides I/O modules for Apache Spark.""" -__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet"] +__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet","SparkStreamingDataSet"] from contextlib import suppress diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index fa6fc9c7e..10680d661 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -6,9 +6,10 @@ import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf +from pyspark.errors.exceptions.captured import AnalysisException from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader -from kedro_datasets.spark.spark_dataset import _split_filepath +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix class SparkStreamingDataSet(AbstractDataSet): @@ -154,3 +155,14 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) + def _exists(self) -> bool: + load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + + try: + self._get_spark().read.load(path=load_path, format="delta") + except AnalysisException as exception: + if "is not a Delta table" in exception.desc: + return False + raise + + return True \ No newline at end of file diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py new file mode 100644 index 000000000..47a427742 --- /dev/null +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -0,0 +1,47 @@ +import pytest +import time +from pyspark.sql import SparkSession +from kedro_datasets.spark import SparkStreamingDataSet,SparkDataSet +from pyspark.sql.types import IntegerType, StringType, StructField, StructType + + +@pytest.fixture +def sample_spark_streaming_df_one(): + schema = StructType( + [ + StructField("sku", StringType(), True), + StructField("new_stock", IntegerType(), True), + ] + ) + data = [("0001", 2), ("0001", 7), ("0002", 4)] + + return SparkSession.builder.getOrCreate() \ + .createDataFrame(data, schema) + + +class TestStreamingDataSet: + def test_load(self,tmp_path, sample_spark_streaming_df_one): + filepath = (tmp_path / "test_streams").as_posix() + spark_json_ds = SparkDataSet(filepath=filepath, file_format="json",save_args=["mode","overwrite"]) + spark_json_ds.save(sample_spark_streaming_df_one) + loaded_with_spark = spark_json_ds.load() + + stream_df = SparkStreamingDataSet(filepath=filepath, file_format="json")._load() + 
assert stream_df.isStreaming + + stream_query = stream_df.writeStream.format("memory").queryName("test").start() + assert stream_query.isActive + time.sleep(3) + stream_query.stop() + loaded_memory_stream = SparkSession.builder.getOrCreate().sql("select * from test") + + assert loaded_memory_stream.exceptAll(loaded_with_spark).count()==0 + + + def test_save(self, tmp_path, sample_spark_df): + filepath = (tmp_path / "test_streams").as_posix() + checkpoint_path = (tmp_path / "checkpoint").as_posix() + streaming_ds = SparkStreamingDataSet(filepath=filepath, save_args=["checkpointLocation",checkpoint_path]) + assert not streaming_ds.exists() + + From 743b823110102c36e0f6a665e3718dc4f9eaa5a7 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:20:02 +0100 Subject: [PATCH 12/96] add unit tests Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 76 +++++++++++++++---- .../spark/test_spark_streaming_dataset.py | 66 ++++++++++------ 2 files changed, 107 insertions(+), 35 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 10680d661..a508a3903 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,13 +1,17 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +import json import os from typing import Any, Dict from copy import deepcopy from pathlib import PurePosixPath import yaml -from kedro.io import AbstractDataSet + +import fsspec +from kedro.io.core import AbstractDataSet,DataSetError, get_filepath_str, get_protocol_and_path from pyspark import SparkConf -from pyspark.errors.exceptions.captured import AnalysisException +from pyspark.sql.utils import AnalysisException from pyspark.sql import SparkSession, DataFrame +from pyspark.sql.types import StructType from yaml.loader import SafeLoader from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix @@ -91,6 +95,37 @@ def __init__( if save_args is not None: self._save_args.update(save_args) + # Handle schema load argument + self._schema = self._load_args.pop("schema", None) + if self._schema is not None: + if isinstance(self._schema, dict): + self._schema = self._load_schema_from_file(self._schema) + + @staticmethod + def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: + filepath = schema.get("filepath") + if not filepath: + raise DataSetError( + "Schema load argument does not specify a 'filepath' attribute. Please" + "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." + ) + + credentials = deepcopy(schema.get("credentials")) or {} + protocol, schema_path = get_protocol_and_path(filepath) + file_system = fsspec.filesystem(protocol, **credentials) + pure_posix_path = PurePosixPath(schema_path) + load_path = get_filepath_str(pure_posix_path, protocol) + + # Open schema file + with file_system.open(load_path, encoding='utf-8') as fs_file: + try: + return StructType.fromJson(json.loads(fs_file.read())) + except Exception as exc: + raise DataSetError( + f"Contents of 'schema.filepath' ({schema_path}) are invalid. Please" + f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." + ) from exc + def _describe(self) -> Dict[str, Any]: """Returns a dict that describes attributes of the dataset.""" return { @@ -116,16 +151,23 @@ def _get_spark(): def _load(self) -> DataFrame: """Loads data from filepath. 
- If the connector type is kafka then no file_path is required + If the connector type is kafka then no file_path is required, schema needs to be seperated from load_args Returns: Data from filepath as pyspark dataframe. """ - input_constructor = ( - self._get_spark() - .readStream.format(self._file_format) - .options(**self._load_args) - ) + if self._schema: + input_constructor = ( + self._get_spark() + .readStream.schema(self._schema).format(self._file_format) + .options(**self._load_args) + ) + else: + input_constructor = ( + self._get_spark() + .readStream.format(self._file_format) + .options(**self._load_args) + ) return ( input_constructor.load() if self._file_format @@ -155,14 +197,22 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - def _exists(self) -> bool: - load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + def _exists(self, schema_path:str) -> bool: + """Check the existence of pyspark dataframe. + Args: + schema_path: schema of saved streaming dataframe + """ + load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + with open(schema_path, encoding='utf-8') as f: + schema = StructType.fromJson(json.loads(f.read())) try: - self._get_spark().read.load(path=load_path, format="delta") + self._get_spark().readStream.schema(schema).load(load_path, self._file_format) except AnalysisException as exception: - if "is not a Delta table" in exception.desc: + if ( + exception.desc.startswith("Path does not exist:") + or "is not a Streaming data" in exception.desc + ): return False raise - return True \ No newline at end of file diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 47a427742..2d936b1ce 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,12 +1,27 @@ +import json import pytest import time from pyspark.sql import SparkSession -from kedro_datasets.spark import SparkStreamingDataSet,SparkDataSet +from kedro_datasets.pandas import ParquetDataSet +from kedro.io.core import DataSetError +from kedro_datasets.spark.spark_dataset import SparkDataSet +from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet from pyspark.sql.types import IntegerType, StringType, StructField, StructType + +def sample_schema(schema_path): + with open(schema_path, encoding='utf-8') as f: + try: + return StructType.fromJson(json.loads(f.read())) + except Exception as exc: + raise DataSetError( + f"Contents of 'schema.filepath' ({schema_path}) are invalid. " + f"Schema is required for streaming data load, Please provide a valid schema_path." 
+ ) from exc + @pytest.fixture -def sample_spark_streaming_df_one(): +def sample_spark_streaming_df(tmp_path): schema = StructType( [ StructField("sku", StringType(), True), @@ -14,34 +29,41 @@ def sample_spark_streaming_df_one(): ] ) data = [("0001", 2), ("0001", 7), ("0002", 4)] - - return SparkSession.builder.getOrCreate() \ - .createDataFrame(data, schema) + schema_path = (tmp_path / "test.json").as_posix() + with open(schema_path, "w") as f: + json.dump(schema.jsonValue(), f) + return SparkSession.builder.getOrCreate().createDataFrame(data, schema) class TestStreamingDataSet: - def test_load(self,tmp_path, sample_spark_streaming_df_one): + def test_load(self, tmp_path, sample_spark_streaming_df): filepath = (tmp_path / "test_streams").as_posix() - spark_json_ds = SparkDataSet(filepath=filepath, file_format="json",save_args=["mode","overwrite"]) - spark_json_ds.save(sample_spark_streaming_df_one) - loaded_with_spark = spark_json_ds.load() + schema_path = (tmp_path / "test.json").as_posix() - stream_df = SparkStreamingDataSet(filepath=filepath, file_format="json")._load() - assert stream_df.isStreaming + spark_json_ds = SparkDataSet( + filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) - stream_query = stream_df.writeStream.format("memory").queryName("test").start() - assert stream_query.isActive - time.sleep(3) - stream_query.stop() - loaded_memory_stream = SparkSession.builder.getOrCreate().sql("select * from test") + streaming_ds = SparkStreamingDataSet(filepath=filepath, file_format="json", + load_args={"schema": {"filepath": schema_path}}).load() + assert streaming_ds.isStreaming + schema = sample_schema(schema_path) + assert streaming_ds.schema == schema - assert loaded_memory_stream.exceptAll(loaded_with_spark).count()==0 + def test_save(self, tmp_path, sample_spark_streaming_df): + filepath = (tmp_path / "test_streams_input").as_posix() + schema_path = (tmp_path / "test.json").as_posix() + checkpoint_path = (tmp_path / "checkpoint").as_posix() + spark_json_ds = SparkDataSet( + filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) - def test_save(self, tmp_path, sample_spark_df): - filepath = (tmp_path / "test_streams").as_posix() - checkpoint_path = (tmp_path / "checkpoint").as_posix() - streaming_ds = SparkStreamingDataSet(filepath=filepath, save_args=["checkpointLocation",checkpoint_path]) - assert not streaming_ds.exists() + streaming_ds = SparkStreamingDataSet( + filepath=filepath, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + ) + assert streaming_ds._exists(schema_path) From 3bb371789fd841ac5a3b5ecd98ebb91121b06eae Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:41:59 +0100 Subject: [PATCH 13/96] update test_save Signed-off-by: Tingting_Wan --- .../tests/spark/test_spark_streaming_dataset.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 2d936b1ce..fa3b0fec8 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -52,18 +52,26 @@ def test_load(self, tmp_path, sample_spark_streaming_df): assert streaming_ds.schema == schema def test_save(self, tmp_path, sample_spark_streaming_df): - filepath = (tmp_path / 
"test_streams_input").as_posix() + filepath_json = (tmp_path / "test_streams").as_posix() + filepath_output = (tmp_path / "test_streams_output").as_posix() schema_path = (tmp_path / "test.json").as_posix() checkpoint_path = (tmp_path / "checkpoint").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath_json, file_format="json", save_args=[{"mode","overwrite"}] ) spark_json_ds.save(sample_spark_streaming_df) + loaded_with_streaming = SparkStreamingDataSet(filepath=filepath_json, file_format="json", + load_args={"schema": {"filepath": schema_path}}).load() + streaming_ds = SparkStreamingDataSet( - filepath=filepath, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + filepath=filepath_output, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} ) + assert not streaming_ds._exists(schema_path) + + streaming_ds.save(loaded_with_streaming) assert streaming_ds._exists(schema_path) + From ae3bc87d1612a0bb2b856ef74e013c1c81de110e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Mon, 17 Apr 2023 10:48:36 +0200 Subject: [PATCH 14/96] Upgrade Polars (#171) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Upgrade Polars Signed-off-by: Juan Luis Cano Rodríguez * Update Polars to 0.17.x --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- kedro-datasets/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 4840b8535..99c30938e 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -7,7 +7,7 @@ SPARK = "pyspark>=2.2, <4.0" HDFS = "hdfs>=2.5.8, <3.0" S3FS = "s3fs>=0.3.0, <0.5" -POLARS = "polars~=0.15.16" +POLARS = "polars~=0.17.0" def _collect_requirements(requires): From eb634a100d44a4cca5267ff336f98e434dd3ac14 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Mon, 17 Apr 2023 14:47:16 +0100 Subject: [PATCH 15/96] if release is failed, it return exit code and fail the CI (#158) Signed-off-by: Tingting_Wan --- tools/circleci/circleci_release.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py index 88c4ed1d0..dd05d4c5a 100755 --- a/tools/circleci/circleci_release.py +++ b/tools/circleci/circleci_release.py @@ -4,6 +4,7 @@ """ import os +import sys import requests from requests.structures import CaseInsensitiveDict @@ -33,12 +34,6 @@ def circleci_release(project_slug, payload, circle_endpoint, circle_release_toke headers["Circle-Token"] = circle_release_token resp = requests.post(circle_endpoint, headers=headers, json=payload, timeout=10) - print(f"Status Code: {resp.status_code}") - if resp.status_code == 201: - print("Creating CircleCI Pipeline successfully") - print(resp.content) - else: - print("Failed to create CircleCI Pipeline") return resp @@ -70,6 +65,14 @@ def circleci_release(project_slug, payload, circle_endpoint, circle_release_toke print(package_name, package_version) if check_no_version_pypi(pypi_endpoint, package_name, package_version): - circleci_release( + res = circleci_release( PROJECT_SLUG, payload, circleci_endpoint, CIRCLE_RELEASE_TOKEN ) + print(f"Status Code: {resp.status_code}") + if resp.status_code == 201: + print("Creating CircleCI Pipeline successfully") + else: + print("Failed to create CircleCI Pipeline") + print(resp.content) 
+ if resp.status_code != 201: + sys.exit(1) From 115940bd52af08966aa2c80cbe08bbbf2224c381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Tue, 18 Apr 2023 13:25:21 +0200 Subject: [PATCH 16/96] Migrate `kedro-airflow` to static metadata (#172) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate kedro-airflow to static metadata See https://github.com/kedro-org/kedro/issues/2334. Signed-off-by: Juan Luis Cano Rodríguez * Add explicit PEP 518 build requirements for kedro-datasets Signed-off-by: Juan Luis Cano Rodríguez * Typos Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Signed-off-by: Juan Luis Cano Rodríguez * Remove dangling reference to requirements.txt Signed-off-by: Juan Luis Cano Rodríguez * Add release notes Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- kedro-airflow/MANIFEST.in | 1 - kedro-airflow/RELEASE.md | 1 + kedro-airflow/pyproject.toml | 48 +++++++++++++++++++++++++++++ kedro-airflow/requirements.txt | 3 -- kedro-airflow/setup.cfg | 10 ------ kedro-airflow/setup.py | 41 ------------------------ kedro-airflow/test_requirements.txt | 1 - kedro-datasets/pyproject.toml | 4 +++ 8 files changed, 53 insertions(+), 56 deletions(-) delete mode 100644 kedro-airflow/requirements.txt delete mode 100644 kedro-airflow/setup.cfg delete mode 100644 kedro-airflow/setup.py diff --git a/kedro-airflow/MANIFEST.in b/kedro-airflow/MANIFEST.in index 523166e84..ed984822f 100644 --- a/kedro-airflow/MANIFEST.in +++ b/kedro-airflow/MANIFEST.in @@ -1,4 +1,3 @@ include README.md include LICENSE.md -include requirements.txt include kedro_airflow/airflow_dag_template.j2 diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index 75e4654e6..c2e0615b4 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,5 +1,6 @@ # Upcoming release 0.5.2 * Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. +* Migrate all project metadata to static `pyproject.toml`. # Release 0.5.1 * Added additional CLI argument `--jinja-file` to provide a path to a custom Jinja2 template. 
diff --git a/kedro-airflow/pyproject.toml b/kedro-airflow/pyproject.toml index 4f3292f55..42fe8974b 100644 --- a/kedro-airflow/pyproject.toml +++ b/kedro-airflow/pyproject.toml @@ -1,3 +1,51 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "kedro-airflow" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Airflow makes it easy to deploy Kedro projects to Airflow" +requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "kedro>=0.17.5", + "python-slugify>=4.0", + "semver~=2.10", # Needs to be at least 2.10.0 to get VersionInfo.match +] +dynamic = ["readme", "version"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-airflow" +Documentation = "https://github.com/kedro-org/kedro-plugins/blob/main/kedro-airflow/README.md" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[project.entry-points."kedro.project_commands"] +airflow = "kedro_airflow.plugin:commands" + +[tool.setuptools] +include-package-data = true +packages = ["kedro_airflow"] +zip-safe = false + +[tool.setuptools.package-data] +kedro_airflow = ["kedro_airflow/airflow_dag_template.j2"] + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_airflow.__version__"} + +[tool.pytest.ini_options] +addopts = """ + --cov-report xml:coverage.xml + --cov-report term-missing + --cov kedro_airflow + --cov tests + --no-cov-on-fail + -ra""" + [tool.black] exclude=".*template.py" diff --git a/kedro-airflow/requirements.txt b/kedro-airflow/requirements.txt deleted file mode 100644 index d1731ba85..000000000 --- a/kedro-airflow/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -kedro>=0.17.5 -python-slugify>=4.0 -semver~=2.10 # Needs to be at least 2.10.0 to get VersionInfo.match diff --git a/kedro-airflow/setup.cfg b/kedro-airflow/setup.cfg deleted file mode 100644 index 7fa30d2d0..000000000 --- a/kedro-airflow/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[metadata] -description-file=README.md - -[tool:pytest] -addopts=--cov-report xml:coverage.xml - --cov-report term-missing - --cov kedro_airflow - --cov tests - --no-cov-on-fail - -ra diff --git a/kedro-airflow/setup.py b/kedro-airflow/setup.py deleted file mode 100644 index 85bb25b8a..000000000 --- a/kedro-airflow/setup.py +++ /dev/null @@ -1,41 +0,0 @@ -import re -from codecs import open -from os import path - -from setuptools import setup - -name = "kedro-airflow" -here = path.abspath(path.dirname(__file__)) - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: - requires = [x.strip() for x in f if x.strip()] - -# get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - -setup( - name=name, - version=version, - description="Kedro-Airflow makes it easy to deploy Kedro projects to Airflow", - long_description=readme, - long_description_content_type="text/markdown", - url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-airflow", - author="Kedro", - python_requires=">=3.7, <3.11", - install_requires=requires, - license="Apache Software License (Apache 2.0)", - packages=["kedro_airflow"], - 
package_data={"kedro_airflow": ["kedro_airflow/airflow_dag_template.j2"]}, - include_package_data=True, - zip_safe=False, - entry_points={ - "kedro.project_commands": ["airflow = kedro_airflow.plugin:commands"] - }, -) diff --git a/kedro-airflow/test_requirements.txt b/kedro-airflow/test_requirements.txt index 4ced2ca4c..cdea520c7 100644 --- a/kedro-airflow/test_requirements.txt +++ b/kedro-airflow/test_requirements.txt @@ -1,4 +1,3 @@ --r requirements.txt apache-airflow<3.0 bandit>=1.6.2, <2.0 behave diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 0f0ad2fc3..a5f494106 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -1,3 +1,7 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + [project] name = "kedro-datasets" authors = [ From 35231afe22ae19087d347d45a9f7247515b88ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Tue, 18 Apr 2023 13:26:53 +0200 Subject: [PATCH 17/96] Migrate `kedro-telemetry` to static metadata (#174) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate kedro-telemetry to static metadata See kedro-org/kedro#2334. Signed-off-by: Juan Luis Cano Rodríguez * Add release notes Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- kedro-telemetry/RELEASE.md | 3 ++ kedro-telemetry/pyproject.toml | 38 +++++++++++++++++++++++++ kedro-telemetry/requirements.txt | 2 -- kedro-telemetry/setup.py | 41 --------------------------- kedro-telemetry/test_requirements.txt | 1 - 5 files changed, 41 insertions(+), 44 deletions(-) delete mode 100644 kedro-telemetry/requirements.txt delete mode 100644 kedro-telemetry/setup.py diff --git a/kedro-telemetry/RELEASE.md b/kedro-telemetry/RELEASE.md index 7cdb93100..bbd32f424 100644 --- a/kedro-telemetry/RELEASE.md +++ b/kedro-telemetry/RELEASE.md @@ -1,3 +1,6 @@ +# Upcoming release +* Migrate all project metadata to static `pyproject.toml`. + # Release 0.2.4 * Added consent checking for collecting project statistics. 
diff --git a/kedro-telemetry/pyproject.toml b/kedro-telemetry/pyproject.toml index 07449ad97..0cc754854 100644 --- a/kedro-telemetry/pyproject.toml +++ b/kedro-telemetry/pyproject.toml @@ -1,3 +1,41 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "kedro-telemetry" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Telemetry" +requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "kedro~=0.18.0", + "requests~=2.20", +] +dynamic = ["readme", "version"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry" +Documentation = "https://github.com/kedro-org/kedro-plugins/blob/main/kedro-telemetry/README.md" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[project.entry-points."kedro.cli_hooks"] +kedro-telemetry = "kedro_telemetry.plugin:cli_hooks" + +[project.entry-points."kedro.hooks"] +kedro-telemetry = "kedro_telemetry.plugin:project_hooks" + +[tool.setuptools] +include-package-data = true +packages = ["kedro_telemetry"] +zip-safe = false + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_telemetry.__version__"} + [tool.isort] multi_line_output = 3 include_trailing_comma = true diff --git a/kedro-telemetry/requirements.txt b/kedro-telemetry/requirements.txt deleted file mode 100644 index c59cb8a9c..000000000 --- a/kedro-telemetry/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -kedro~=0.18.0 -requests~=2.20 diff --git a/kedro-telemetry/setup.py b/kedro-telemetry/setup.py deleted file mode 100644 index db6a976d2..000000000 --- a/kedro-telemetry/setup.py +++ /dev/null @@ -1,41 +0,0 @@ -import re -from codecs import open -from os import path - -from setuptools import setup - -name = "kedro-telemetry" -here = path.abspath(path.dirname(__file__)) - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: - requires = [x.strip() for x in f if x.strip()] - -# Get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - -setup( - name=name, - version=version, - description="Kedro-Telemetry", - long_description=readme, - long_description_content_type="text/markdown", - url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry", - author="Kedro", - python_requires=">=3.7, <3.11", - install_requires=requires, - license="Apache Software License (Apache 2.0)", - packages=["kedro_telemetry"], - include_package_data=True, - zip_safe=False, - entry_points={ - "kedro.cli_hooks": ["kedro-telemetry = kedro_telemetry.plugin:cli_hooks"], - "kedro.hooks": ["kedro-telemetry = kedro_telemetry.plugin:project_hooks"] - }, -) diff --git a/kedro-telemetry/test_requirements.txt b/kedro-telemetry/test_requirements.txt index 4f39e717a..fb187d672 100644 --- a/kedro-telemetry/test_requirements.txt +++ b/kedro-telemetry/test_requirements.txt @@ -1,4 +1,3 @@ --r requirements.txt bandit>=1.6.2, <2.0 behave black~=22.0 From 8c2ea1bafed6554432f2dfcf4e20df2029d945b0 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Wed, 19 Apr 2023 15:21:17 +0100 Subject: [PATCH 18/96] ci: port lint, unit test, 
and e2e tests to Actions (#155) * Add unit test + lint test on GA * trigger GA - will revert Signed-off-by: Ankita Katiyar * Fix lint Signed-off-by: Ankita Katiyar * Add end to end tests * Add cache key Signed-off-by: Ankita Katiyar * Add cache action Signed-off-by: Ankita Katiyar * Rename workflow files Signed-off-by: Ankita Katiyar * Lint + add comment + default bash Signed-off-by: Ankita Katiyar * Add windows test Signed-off-by: Ankita Katiyar * Update workflow name + revert changes to READMEs Signed-off-by: Ankita Katiyar * Add kedro-telemetry/RELEASE.md to trufflehog ignore Signed-off-by: Ankita Katiyar * Add pytables to test_requirements remove from workflow Signed-off-by: Ankita Katiyar * Revert "Add pytables to test_requirements remove from workflow" This reverts commit 8203daa6405d325c74ec2097c9d0c5859bae8257. * Separate pip freeze step Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar Signed-off-by: Tingting_Wan --- .github/workflows/check-plugin.yml | 134 ++++++++++++++++++++++++++ .github/workflows/kedro-airflow.yml | 16 +++ .github/workflows/kedro-datasets.yml | 16 +++ .github/workflows/kedro-docker.yml | 16 +++ .github/workflows/kedro-telemetry.yml | 16 +++ trufflehog-ignore.txt | 2 + 6 files changed, 200 insertions(+) create mode 100644 .github/workflows/check-plugin.yml create mode 100644 .github/workflows/kedro-airflow.yml create mode 100644 .github/workflows/kedro-datasets.yml create mode 100644 .github/workflows/kedro-docker.yml create mode 100644 .github/workflows/kedro-telemetry.yml diff --git a/.github/workflows/check-plugin.yml b/.github/workflows/check-plugin.yml new file mode 100644 index 000000000..a32c0f651 --- /dev/null +++ b/.github/workflows/check-plugin.yml @@ -0,0 +1,134 @@ +name: Running tests and linter + +on: + workflow_call: + inputs: + plugin: + type: string + +jobs: + unit-tests: + defaults: + run: + shell: bash + strategy: + matrix: + os: [ ubuntu-latest, windows-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10" ] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{matrix.python-version}} + uses: actions/setup-python@v3 + with: + python-version: ${{matrix.python-version}} + - name: Cache python packages for Linux + if: matrix.os == 'ubuntu-latest' + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Cache python packages for Windows + if: matrix.os == 'windows-latest' + uses: actions/cache@v3 + with: + path: ~\AppData\Local\pip\Cache + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Install Kedro + run: pip install git+https://github.com/kedro-org/kedro@main + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + pip install -r test_requirements.txt + - name: Install pytables (only for kedro-datasets on windows) + if: matrix.os == 'windows-latest' && inputs.plugin == 'kedro-datasets' + run: pip install tables + - name: pip freeze + run: pip freeze + - name: Run unit tests for Linux / all plugins + if: matrix.os != 'windows-latest' + run: make plugin=${{ inputs.plugin }} test + - name: Run unit tests for Windows / kedro-airflow, kedro-docker, kedro-telemetry + if: matrix.os == 'windows-latest' && inputs.plugin != 'kedro-datasets' + run: | + cd ${{ inputs.plugin }} + pytest tests + - name: Run unit tests for Windows / kedro-datasets / no spark sequential + if: 
matrix.os == 'windows-latest' && inputs.plugin == 'kedro-datasets' && matrix.python-version == '3.10' + run: | + make test-no-spark-sequential + - name: Run unit tests for Windows / kedro-datasets / no spark parallel + if: matrix.os == 'windows-latest' && inputs.plugin == 'kedro-datasets' && matrix.python-version != '3.10' + run: | + make test-no-spark + + lint: + defaults: + run: + shell: bash + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python 3.8 + uses: actions/setup-python@v3 + with: + python-version: 3.8 + - name: Cache python packages + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + pip install git+https://github.com/kedro-org/kedro@main + pip install -r test_requirements.txt + pip freeze + - name: Install pre-commit hooks + run: | + cd ${{ inputs.plugin }} + pre-commit install --install-hooks + pre-commit install --hook-type pre-push + - name: Run linter + run: make plugin=${{ inputs.plugin }} lint + + e2e-tests: + if: inputs.plugin != 'kedro-datasets' + defaults: + run: + shell: bash + strategy: + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10" ] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{matrix.python-version}} + uses: actions/setup-python@v3 + with: + python-version: ${{matrix.python-version}} + - name: Cache python packages + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + pip install git+https://github.com/kedro-org/kedro@main + pip install -r test_requirements.txt + - name: pip freeze + run: pip freeze + - name: Run end to end tests + # Custom shell to run kedro-docker e2e-tests because -it flag for `docker run` + # isn't supported on Github Actions. 
See https://github.com/actions/runner/issues/241 + shell: 'script -q -e -c "bash {0}"' + run: make plugin=${{ inputs.plugin }} e2e-tests diff --git a/.github/workflows/kedro-airflow.yml b/.github/workflows/kedro-airflow.yml new file mode 100644 index 000000000..b68fcce30 --- /dev/null +++ b/.github/workflows/kedro-airflow.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-airflow + +on: + push: + paths: + - "kedro-airflow/**" + pull_request: + paths: + - "kedro-airflow/**" + types: [ synchronize ] + +jobs: + airflow-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-airflow diff --git a/.github/workflows/kedro-datasets.yml b/.github/workflows/kedro-datasets.yml new file mode 100644 index 000000000..9ff4802b6 --- /dev/null +++ b/.github/workflows/kedro-datasets.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-datasets + +on: + push: + paths: + - "kedro-datasets/**" + pull_request: + paths: + - "kedro-datasets/**" + types: [ synchronize ] + +jobs: + datasets-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-datasets diff --git a/.github/workflows/kedro-docker.yml b/.github/workflows/kedro-docker.yml new file mode 100644 index 000000000..1812a3a93 --- /dev/null +++ b/.github/workflows/kedro-docker.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-docker + +on: + push: + paths: + - "kedro-docker/**" + pull_request: + paths: + - "kedro-docker/**" + types: [ synchronize ] + +jobs: + docker-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-docker diff --git a/.github/workflows/kedro-telemetry.yml b/.github/workflows/kedro-telemetry.yml new file mode 100644 index 000000000..fd75e8a71 --- /dev/null +++ b/.github/workflows/kedro-telemetry.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-telemetry + +on: + push: + paths: + - "kedro-telemetry/**" + pull_request: + paths: + - "kedro-telemetry/**" + types: [ synchronize ] + +jobs: + telemetry-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-telemetry diff --git a/trufflehog-ignore.txt b/trufflehog-ignore.txt index 041fc7ffd..1929a2634 100644 --- a/trufflehog-ignore.txt +++ b/trufflehog-ignore.txt @@ -1 +1,3 @@ kedro-telemetry/README.md +kedro-telemetry/RELEASE.md +kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py From a73b216543f0ee726d85f2ffbb578038e75a8b50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Wed, 19 Apr 2023 17:08:42 +0200 Subject: [PATCH 19/96] Migrate `kedro-docker` to static metadata (#173) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate kedro-docker to static metadata See https://github.com/kedro-org/kedro/issues/2334. 
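The `[tool.setuptools.dynamic]` table lets setuptools resolve the version from
`kedro_docker.__version__` at build time, which is what the regex lookup in the
deleted `setup.py` did by hand. As a rough sketch only (the `read_version`
helper below is hypothetical and not part of this patch), that old lookup was
equivalent to:

    import re
    from pathlib import Path

    def read_version(package_name: str = "kedro_docker") -> str:
        """Parse __version__ out of <package>/__init__.py, as the removed setup.py did."""
        init_py = Path(package_name) / "__init__.py"
        match = re.search(r'__version__ = ["\']([^"\']+)', init_py.read_text(encoding="utf-8"))
        if match is None:
            raise ValueError(f"__version__ not found in {init_py}")
        return match.group(1)

    # Run from the kedro-docker directory, this prints the same version string that
    # version = {attr = "kedro_docker.__version__"} now resolves during the build.
    print(read_version())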
Signed-off-by: Juan Luis Cano Rodríguez * Address packaging warning Signed-off-by: Juan Luis Cano Rodríguez * Fix tests Signed-off-by: Juan Luis Cano Rodríguez * Actually install current plugin with dependencies Signed-off-by: Juan Luis Cano Rodríguez * Add release notes Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Tingting_Wan --- .circleci/continue_config.yml | 1 + kedro-docker/MANIFEST.in | 1 + kedro-docker/RELEASE.md | 1 + kedro-docker/features/environment.py | 2 +- kedro-docker/pyproject.toml | 55 ++++++++++++++++++++++++++++ kedro-docker/requirements.txt | 3 -- kedro-docker/setup.cfg | 10 ----- kedro-docker/setup.py | 44 ---------------------- kedro-docker/test_requirements.txt | 1 - 9 files changed, 59 insertions(+), 59 deletions(-) create mode 100644 kedro-docker/MANIFEST.in delete mode 100644 kedro-docker/requirements.txt delete mode 100644 kedro-docker/setup.cfg delete mode 100644 kedro-docker/setup.py diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index 5a1d78015..82653758e 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -69,6 +69,7 @@ commands: command: | cd <> pip install git+https://github.com/kedro-org/kedro@main + pip install . pip install -r test_requirements.txt - run: name: Install pre-commit hooks diff --git a/kedro-docker/MANIFEST.in b/kedro-docker/MANIFEST.in new file mode 100644 index 000000000..451642d6f --- /dev/null +++ b/kedro-docker/MANIFEST.in @@ -0,0 +1 @@ +recursive-include kedro_docker/template * diff --git a/kedro-docker/RELEASE.md b/kedro-docker/RELEASE.md index eeb2f0e41..4bd5b8bbd 100644 --- a/kedro-docker/RELEASE.md +++ b/kedro-docker/RELEASE.md @@ -1,4 +1,5 @@ # Upcoming release +* Migrate all project metadata to static `pyproject.toml`. ## Major features and improvements diff --git a/kedro-docker/features/environment.py b/kedro-docker/features/environment.py index 04a5f25cf..930f97a7d 100644 --- a/kedro-docker/features/environment.py +++ b/kedro-docker/features/environment.py @@ -51,7 +51,7 @@ def before_all(context): ) # install the plugin - call([context.python, "setup.py", "install"], env=context.env) + call([context.python, "-m", "pip", "install", "."], env=context.env) def _setup_context_with_venv(context, venv_dir): diff --git a/kedro-docker/pyproject.toml b/kedro-docker/pyproject.toml index 0b54e6e31..cdd273509 100644 --- a/kedro-docker/pyproject.toml +++ b/kedro-docker/pyproject.toml @@ -1,3 +1,58 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "kedro-docker" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Docker makes it easy to package Kedro projects with Docker." 
+requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "anyconfig~=0.10.0", # not directly required, pinned by Snyk to avoid a vulnerability + "kedro>=0.16.0", + "semver~=2.10", # Needs to be at least 2.10.0 to get VersionInfo.match +] +dynamic = ["readme", "version"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker" +Documentation = "https://github.com/kedro-org/kedro-plugins/blob/main/kedro-docker/README.md" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[project.entry-points."kedro.project_commands"] +docker = "kedro_docker.plugin:commands" + +[tool.setuptools] +include-package-data = true +zip-safe = false + +[tool.setuptools.packages.find] +include = ["kedro_docker*"] +namespaces = true # To include the template files + +[tool.setuptools.package-data] +kedro_docker = [ + "template/Dockerfile.*", + "template/.dockerignore", + "template/.dive-ci", +] + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_docker.__version__"} + +[tool.pytest.ini_options] +addopts = """ + --cov-report xml:coverage.xml + --cov-report term-missing + --cov kedro_docker + --cov tests + --no-cov-on-fail + -ra""" + [tool.black] [tool.isort] diff --git a/kedro-docker/requirements.txt b/kedro-docker/requirements.txt deleted file mode 100644 index 86c576113..000000000 --- a/kedro-docker/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -anyconfig~=0.10.0 # not directly required, pinned by Snyk to avoid a vulnerability -kedro>=0.16.0 -semver~=2.10 # Needs to be at least 2.10.0 to get VersionInfo.match diff --git a/kedro-docker/setup.cfg b/kedro-docker/setup.cfg deleted file mode 100644 index 9ba92fe11..000000000 --- a/kedro-docker/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[metadata] -description-file=README.md - -[tool:pytest] -addopts=--cov-report xml:coverage.xml - --cov-report term-missing - --cov kedro_docker - --cov tests - --no-cov-on-fail - -ra diff --git a/kedro-docker/setup.py b/kedro-docker/setup.py deleted file mode 100644 index b2ef23ca3..000000000 --- a/kedro-docker/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -import re -from codecs import open -from os import path - -from setuptools import setup - -name = "kedro-docker" -here = path.abspath(path.dirname(__file__)) - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: - requires = [x.strip() for x in f if x.strip()] - -# get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - -setup( - name=name, - version=version, - description="Kedro-Docker makes it easy to package Kedro projects with Docker.", - long_description=readme, - long_description_content_type="text/markdown", - url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker", - license="Apache Software License (Apache 2.0)", - python_requires=">=3.7, <3.11", - install_requires=requires, - author="Kedro", - packages=["kedro_docker"], - package_data={ - "kedro_docker": [ - "template/Dockerfile.*", - "template/.dockerignore", - "template/.dive-ci", - ] - }, - zip_safe=False, - entry_points={"kedro.project_commands": ["docker = kedro_docker.plugin:commands"]}, -) diff --git 
a/kedro-docker/test_requirements.txt b/kedro-docker/test_requirements.txt index 771ee88a6..01af755ac 100644 --- a/kedro-docker/test_requirements.txt +++ b/kedro-docker/test_requirements.txt @@ -1,4 +1,3 @@ --r requirements.txt bandit>=1.6.2, <2.0 behave>=1.2.6, <2.0 black~=22.0 From 7f4527dc3fbd4cb0922a6757b1f7c1dc80ce98b2 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Fri, 21 Apr 2023 16:32:23 +0100 Subject: [PATCH 20/96] Introdcuing .gitpod.yml to kedro-plugins (#185) Currently opening gitpod will installed a Python 3.11 which breaks everything because we don't support it set. This PR introduce a simple .gitpod.yml to get it started. Signed-off-by: Tingting_Wan --- .gitpod.yml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .gitpod.yml diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 000000000..70738f4c0 --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,33 @@ +# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart +image: gitpod/workspace-python-3.10:2023-04-20-16-32-37 + + +tasks: + # We want packages installed during the pre-build init steps to go to /workspace + # rather than ~ so that they are persisted. Gitpod sets PIP_USER=yes to ensure this, + # but pre-commit requires PIP_USER=no. Hence we set PIP_USER=no and use + # pip install --user to install to /workspace. + - name: kedro-plugins + before: | + echo PIP_USER=no >> ~/.bashrc && export PIP_USER=no + init: | + make sign-off + command: | + pre-commit install --install-hooks + clear + + +github: + prebuilds: + # enable for the master/default branch (defaults to true) + master: true + # enable for all branches in this repo (defaults to false) + branches: true + # enable for pull requests coming from this repo (defaults to true) + pullRequests: true + # enable for pull requests coming from forks (defaults to false) + pullRequestsFromForks: true + # add a "Review in Gitpod" button as a comment to pull requests (defaults to true) + addComment: false + # add a "Review in Gitpod" button to pull requests (defaults to false) + addBadge: true From 57a11d61801ff9ce66f8b9a842fd58031e552b81 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Mon, 24 Apr 2023 13:32:52 +0100 Subject: [PATCH 21/96] sync APIDataSet from kedro's `develop` (#184) * Update APIDataSet Signed-off-by: Nok Chan * Sync ParquetDataSet Signed-off-by: Nok Chan * Sync Test Signed-off-by: Nok Chan * Linting Signed-off-by: Nok Chan * Revert Unnecessary ParquetDataSet Changes Signed-off-by: Nok Chan * Sync release notes Signed-off-by: Nok Chan --------- Signed-off-by: Nok Chan Signed-off-by: Tingting_Wan --- kedro-datasets/RELEASE.md | 2 +- .../kedro_datasets/api/api_dataset.py | 111 +++---- kedro-datasets/tests/api/test_api_dataset.py | 273 ++++++++++++------ .../matplotlib/test_matplotlib_writer.py | 2 - .../tests/polars/test_csv_dataset.py | 1 - 5 files changed, 242 insertions(+), 147 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 01a3b92dc..e1185b54d 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -3,7 +3,7 @@ ## Major features and improvements: * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). - +* Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. 
This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index 4f0ffb4cc..cb8f80d37 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -1,12 +1,17 @@ """``APIDataSet`` loads the data from HTTP(S) APIs. It uses the python requests library: https://requests.readthedocs.io/en/latest/ """ -from typing import Any, Dict, Iterable, List, NoReturn, Union +from typing import Any, Dict, List, NoReturn, Tuple, Union import requests from kedro.io.core import AbstractDataSet, DataSetError +from requests import Session, sessions from requests.auth import AuthBase +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + class APIDataSet(AbstractDataSet[None, requests.Response]): """``APIDataSet`` loads the data from HTTP(S) APIs. @@ -34,88 +39,89 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): data_catalog.html#use-the-data-catalog-with-the-code-api>`_: :: - >>> from kedro_datasets.api import APIDataSet + >>> from kedro.extras.datasets.api import APIDataSet >>> >>> >>> data_set = APIDataSet( >>> url="https://quickstats.nass.usda.gov", - >>> params={ - >>> "key": "SOME_TOKEN", - >>> "format": "JSON", - >>> "commodity_desc": "CORN", - >>> "statisticcat_des": "YIELD", - >>> "agg_level_desc": "STATE", - >>> "year": 2000 - >>> } + >>> load_args={ + >>> "params": { + >>> "key": "SOME_TOKEN", + >>> "format": "JSON", + >>> "commodity_desc": "CORN", + >>> "statisticcat_des": "YIELD", + >>> "agg_level_desc": "STATE", + >>> "year": 2000 + >>> } + >>> }, + >>> credentials=("username", "password") >>> ) >>> data = data_set.load() """ - # pylint: disable=too-many-arguments def __init__( self, url: str, method: str = "GET", - data: Any = None, - params: Dict[str, Any] = None, - headers: Dict[str, Any] = None, - auth: Union[Iterable[str], AuthBase] = None, - json: Union[List, Dict[str, Any]] = None, - timeout: int = 60, - credentials: Union[Iterable[str], AuthBase] = None, + load_args: Dict[str, Any] = None, + credentials: Union[Tuple[str, str], List[str], AuthBase] = None, ) -> None: """Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint. Args: url: The API URL endpoint. method: The Method of the request, GET, POST, PUT, DELETE, HEAD, etc... - data: The request payload, used for POST, PUT, etc requests - https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests - params: The url parameters of the API. - https://requests.readthedocs.io/en/latest/user/quickstart/#passing-parameters-in-urls - headers: The HTTP headers. - https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers - auth: Anything ``requests`` accepts. Normally it's either ``('login', 'password')``, - or ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases. Any - iterable will be cast to a tuple. - json: The request payload, used for POST, PUT, etc requests, passed in - to the json kwarg in the requests object. 
- https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests - timeout: The wait time in seconds for a response, defaults to 1 minute. - https://requests.readthedocs.io/en/latest/user/quickstart/#timeouts - credentials: same as ``auth``. Allows specifying ``auth`` secrets in - credentials.yml. - + load_args: Additional parameters to be fed to requests.request. + https://requests.readthedocs.io/en/latest/api/#requests.request + credentials: Allows specifying secrets in credentials.yml. + Expected format is ``('login', 'password')`` if given as a tuple or list. + An ``AuthBase`` instance can be provided for more complex cases. Raises: - ValueError: if both ``credentials`` and ``auth`` are specified. + ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are specified. """ super().__init__() - if credentials is not None and auth is not None: + self._load_args = load_args or {} + self._load_args_auth = self._load_args.pop("auth", None) + + if credentials is not None and self._load_args_auth is not None: raise ValueError("Cannot specify both auth and credentials.") - auth = credentials or auth + self._auth = credentials or self._load_args_auth + + if "cert" in self._load_args: + self._load_args["cert"] = self._convert_type(self._load_args["cert"]) - if isinstance(auth, Iterable): - auth = tuple(auth) + if "timeout" in self._load_args: + self._load_args["timeout"] = self._convert_type(self._load_args["timeout"]) self._request_args: Dict[str, Any] = { "url": url, "method": method, - "data": data, - "params": params, - "headers": headers, - "auth": auth, - "json": json, - "timeout": timeout, + "auth": self._convert_type(self._auth), + **self._load_args, } + @staticmethod + def _convert_type(value: Any): + """ + From the Data Catalog, iterables are provided as Lists. + However, for some parameters in the Python requests library, + only Tuples are allowed. 
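+        For example, a ``cert`` given in the catalog as
+        ``["cert.pem", "privkey.pem"]`` is converted to the tuple
+        ``("cert.pem", "privkey.pem")`` before being handed to requests.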
+ """ + if isinstance(value, List): + return tuple(value) + return value + def _describe(self) -> Dict[str, Any]: - return {**self._request_args} + # prevent auth from logging + request_args_cp = self._request_args.copy() + request_args_cp.pop("auth", None) + return request_args_cp - def _execute_request(self) -> requests.Response: + def _execute_request(self, session: Session) -> requests.Response: try: - response = requests.request(**self._request_args) + response = session.request(**self._request_args) response.raise_for_status() except requests.exceptions.HTTPError as exc: raise DataSetError("Failed to fetch data", exc) from exc @@ -125,12 +131,13 @@ def _execute_request(self) -> requests.Response: return response def _load(self) -> requests.Response: - return self._execute_request() + with sessions.Session() as session: + return self._execute_request(session) def _save(self, data: None) -> NoReturn: raise DataSetError(f"{self.__class__.__name__} is a read only data set type") def _exists(self) -> bool: - response = self._execute_request() - + with sessions.Session() as session: + response = self._execute_request(session) return response.ok diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index c84290750..848020041 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -1,11 +1,11 @@ # pylint: disable=no-member -import json +import base64 import socket import pytest import requests -import requests_mock from kedro.io.core import DataSetError +from requests.auth import HTTPBasicAuth from kedro_datasets.api import APIDataSet @@ -13,96 +13,190 @@ TEST_URL = "http://example.com/api/test" TEST_TEXT_RESPONSE_DATA = "This is a response." -TEST_JSON_RESPONSE_DATA = [{"key": "value"}] +TEST_JSON_REQUEST_DATA = [{"key": "value"}] TEST_PARAMS = {"param": "value"} TEST_URL_WITH_PARAMS = TEST_URL + "?param=value" - +TEST_METHOD = "GET" TEST_HEADERS = {"key": "value"} -@pytest.mark.parametrize("method", POSSIBLE_METHODS) class TestAPIDataSet: - @pytest.fixture - def requests_mocker(self): - with requests_mock.Mocker() as mock: - yield mock + @pytest.mark.parametrize("method", POSSIBLE_METHODS) + def test_request_method(self, requests_mock, method): + api_data_set = APIDataSet(url=TEST_URL, method=method) + requests_mock.register_uri(method, TEST_URL, text=TEST_TEXT_RESPONSE_DATA) + + response = api_data_set.load() + assert response.text == TEST_TEXT_RESPONSE_DATA - def test_successfully_load_with_response(self, requests_mocker, method): + @pytest.mark.parametrize( + "parameters_in, url_postfix", + [ + ({"param": "value"}, "?param=value"), + (bytes("a=1", "latin-1"), "?a=1"), + ], + ) + def test_params_in_request(self, requests_mock, parameters_in, url_postfix): api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, method=TEST_METHOD, load_args={"params": parameters_in} ) - requests_mocker.register_uri( - method, - TEST_URL_WITH_PARAMS, - headers=TEST_HEADERS, - text=TEST_TEXT_RESPONSE_DATA, + requests_mock.register_uri( + TEST_METHOD, TEST_URL + url_postfix, text=TEST_TEXT_RESPONSE_DATA ) response = api_data_set.load() assert isinstance(response, requests.Response) assert response.text == TEST_TEXT_RESPONSE_DATA - def test_successful_json_load_with_response(self, requests_mocker, method): + def test_json_in_request(self, requests_mock): api_data_set = APIDataSet( url=TEST_URL, - method=method, - json=TEST_JSON_RESPONSE_DATA, - 
headers=TEST_HEADERS, + method=TEST_METHOD, + load_args={"json": TEST_JSON_REQUEST_DATA}, ) - requests_mocker.register_uri( - method, + requests_mock.register_uri(TEST_METHOD, TEST_URL) + + response = api_data_set.load() + assert response.request.json() == TEST_JSON_REQUEST_DATA + + def test_headers_in_request(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"headers": TEST_HEADERS} + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL, headers={"pan": "cake"}) + + response = api_data_set.load() + + assert response.request.headers["key"] == "value" + assert response.headers["pan"] == "cake" + + def test_api_cookies(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"cookies": {"pan": "cake"}} + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL, text="text") + + response = api_data_set.load() + assert response.request.headers["Cookie"] == "pan=cake" + + def test_credentials_auth_error(self): + """ + If ``auth`` in ``load_args`` and ``credentials`` are both provided, + the constructor should raise a ValueError. + """ + with pytest.raises(ValueError, match="both auth and credentials"): + APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"auth": []}, credentials={} + ) + + @staticmethod + def _basic_auth(username, password): + encoded = base64.b64encode(f"{username}:{password}".encode("latin-1")) + return f"Basic {encoded.decode('latin-1')}" + + @pytest.mark.parametrize( + "auth_kwarg", + [ + {"load_args": {"auth": ("john", "doe")}}, + {"load_args": {"auth": ["john", "doe"]}}, + {"load_args": {"auth": HTTPBasicAuth("john", "doe")}}, + {"credentials": ("john", "doe")}, + {"credentials": ["john", "doe"]}, + {"credentials": HTTPBasicAuth("john", "doe")}, + ], + ) + def test_auth_sequence(self, requests_mock, auth_kwarg): + api_data_set = APIDataSet(url=TEST_URL, method=TEST_METHOD, **auth_kwarg) + requests_mock.register_uri( + TEST_METHOD, TEST_URL, - headers=TEST_HEADERS, - text=json.dumps(TEST_JSON_RESPONSE_DATA), + text=TEST_TEXT_RESPONSE_DATA, ) response = api_data_set.load() assert isinstance(response, requests.Response) - assert response.json() == TEST_JSON_RESPONSE_DATA + assert response.request.headers["Authorization"] == TestAPIDataSet._basic_auth( + "john", "doe" + ) + assert response.text == TEST_TEXT_RESPONSE_DATA - def test_http_error(self, requests_mocker, method): + @pytest.mark.parametrize( + "timeout_in, timeout_out", + [ + (1, 1), + ((1, 2), (1, 2)), + ([1, 2], (1, 2)), + ], + ) + def test_api_timeout(self, requests_mock, timeout_in, timeout_out): api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, method=TEST_METHOD, load_args={"timeout": timeout_in} ) - requests_mocker.register_uri( - method, - TEST_URL_WITH_PARAMS, - headers=TEST_HEADERS, - text="Nope, not found", - status_code=requests.codes.FORBIDDEN, + requests_mock.register_uri(TEST_METHOD, TEST_URL) + response = api_data_set.load() + assert response.request.timeout == timeout_out + + def test_stream(self, requests_mock): + text = "I am being streamed." 
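+        # ``stream`` is forwarded through ``load_args`` straight to requests, so the
+        # body is only consumed when ``iter_content`` is called further down.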
+ + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"stream": True} ) - with pytest.raises(DataSetError, match="Failed to fetch data"): - api_data_set.load() + requests_mock.register_uri(TEST_METHOD, TEST_URL, text=text) + + response = api_data_set.load() + assert isinstance(response, requests.Response) + assert response.request.stream + + chunks = list(response.iter_content(chunk_size=2, decode_unicode=True)) + assert chunks == ["I ", "am", " b", "ei", "ng", " s", "tr", "ea", "me", "d."] - def test_socket_error(self, requests_mocker, method): + def test_proxy(self, requests_mock): api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url="ftp://example.com/api/test", + method=TEST_METHOD, + load_args={"proxies": {"ftp": "ftp://127.0.0.1:3000"}}, + ) + requests_mock.register_uri( + TEST_METHOD, + "ftp://example.com/api/test", ) - requests_mocker.register_uri(method, TEST_URL_WITH_PARAMS, exc=socket.error) - with pytest.raises(DataSetError, match="Failed to connect"): - api_data_set.load() + response = api_data_set.load() + assert response.request.proxies.get("ftp") == "ftp://127.0.0.1:3000" - def test_read_only_mode(self, method): - """ - Saving is disabled on the data set. - """ - api_data_set = APIDataSet(url=TEST_URL, method=method) - with pytest.raises(DataSetError, match="is a read only data set type"): - api_data_set.save({}) + @pytest.mark.parametrize( + "cert_in, cert_out", + [ + (("cert.pem", "privkey.pem"), ("cert.pem", "privkey.pem")), + (["cert.pem", "privkey.pem"], ("cert.pem", "privkey.pem")), + ("some/path/to/file.pem", "some/path/to/file.pem"), + (None, None), + ], + ) + def test_certs(self, requests_mock, cert_in, cert_out): + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"cert": cert_in} + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL) - def test_exists_http_error(self, requests_mocker, method): + response = api_data_set.load() + assert response.request.cert == cert_out + + def test_exists_http_error(self, requests_mock): """ In case of an unexpected HTTP error, ``exists()`` should not silently catch it. """ api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, ) - requests_mocker.register_uri( - method, + requests_mock.register_uri( + TEST_METHOD, TEST_URL_WITH_PARAMS, headers=TEST_HEADERS, text="Nope, not found", @@ -111,16 +205,18 @@ def test_exists_http_error(self, requests_mocker, method): with pytest.raises(DataSetError, match="Failed to fetch data"): api_data_set.exists() - def test_exists_ok(self, requests_mocker, method): + def test_exists_ok(self, requests_mock): """ If the file actually exists and server responds 200, ``exists()`` should return True """ api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, ) - requests_mocker.register_uri( - method, + requests_mock.register_uri( + TEST_METHOD, TEST_URL_WITH_PARAMS, headers=TEST_HEADERS, text=TEST_TEXT_RESPONSE_DATA, @@ -128,43 +224,38 @@ def test_exists_ok(self, requests_mocker, method): assert api_data_set.exists() - def test_credentials_auth_error(self, method): - """ - If ``auth`` and ``credentials`` are both provided, - the constructor should raise a ValueError. 
- """ - with pytest.raises(ValueError, match="both auth and credentials"): - APIDataSet(url=TEST_URL, method=method, auth=[], credentials=[]) - - @pytest.mark.parametrize("auth_kwarg", ["auth", "credentials"]) - @pytest.mark.parametrize( - "auth_seq", - [ - ("username", "password"), - ["username", "password"], - (e for e in ["username", "password"]), # Generator. - ], - ) - def test_auth_sequence(self, requests_mocker, method, auth_seq, auth_kwarg): - """ - ``auth`` and ``credentials`` should be able to be any Iterable. - """ - kwargs = { - "url": TEST_URL, - "method": method, - "params": TEST_PARAMS, - "headers": TEST_HEADERS, - auth_kwarg: auth_seq, - } - - api_data_set = APIDataSet(**kwargs) - requests_mocker.register_uri( - method, + def test_http_error(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + requests_mock.register_uri( + TEST_METHOD, TEST_URL_WITH_PARAMS, headers=TEST_HEADERS, - text=TEST_TEXT_RESPONSE_DATA, + text="Nope, not found", + status_code=requests.codes.FORBIDDEN, ) - response = api_data_set.load() - assert isinstance(response, requests.Response) - assert response.text == TEST_TEXT_RESPONSE_DATA + with pytest.raises(DataSetError, match="Failed to fetch data"): + api_data_set.load() + + def test_socket_error(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL_WITH_PARAMS, exc=socket.error) + + with pytest.raises(DataSetError, match="Failed to connect"): + api_data_set.load() + + def test_read_only_mode(self): + """ + Saving is disabled on the data set. + """ + api_data_set = APIDataSet(url=TEST_URL, method=TEST_METHOD) + with pytest.raises(DataSetError, match="is a read only data set type"): + api_data_set.save({}) diff --git a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py index 0745452c6..4086e127e 100644 --- a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py +++ b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py @@ -170,7 +170,6 @@ def test_dict_save(self, tmp_path, mock_dict_plot, plot_writer, mocked_s3_bucket plot_writer.save(mock_dict_plot) for colour in COLOUR_LIST: - download_path = tmp_path / "downloaded_image.png" actual_filepath = tmp_path / "locally_saved.png" @@ -361,7 +360,6 @@ def test_list_save(self, tmp_path, mock_list_plot, versioned_plot_writer): versioned_plot_writer.save(mock_list_plot) for index in range(5): - test_path = tmp_path / "test_image.png" versioned_filepath = str(versioned_plot_writer._get_load_path()) diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index 8b05a2025..d79183539 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -77,7 +77,6 @@ def mocked_dataframe(): @pytest.fixture def mocked_csv_in_s3(mocked_s3_bucket, mocked_dataframe: pl.DataFrame): - binarycsv = mocked_dataframe.write_csv()[:-1] mocked_s3_bucket.put_object( From 11c3888a9930b1d10795dc0d82e240975382c7ab Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:45:45 +0100 Subject: [PATCH 22/96] formatting Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 23 ++++++++---- .../spark/test_spark_streaming_dataset.py | 36 ++++++++++--------- 2 files changed, 36 
insertions(+), 23 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index a508a3903..4cb19e6e5 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -7,7 +7,12 @@ import yaml import fsspec -from kedro.io.core import AbstractDataSet,DataSetError, get_filepath_str, get_protocol_and_path +from kedro.io.core import ( + AbstractDataSet, + DataSetError, + get_filepath_str, + get_protocol_and_path, +) from pyspark import SparkConf from pyspark.sql.utils import AnalysisException from pyspark.sql import SparkSession, DataFrame @@ -117,7 +122,7 @@ def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: load_path = get_filepath_str(pure_posix_path, protocol) # Open schema file - with file_system.open(load_path, encoding='utf-8') as fs_file: + with file_system.open(load_path, encoding="utf-8") as fs_file: try: return StructType.fromJson(json.loads(fs_file.read())) except Exception as exc: @@ -159,7 +164,8 @@ def _load(self) -> DataFrame: if self._schema: input_constructor = ( self._get_spark() - .readStream.schema(self._schema).format(self._file_format) + .readStream.schema(self._schema) + .format(self._file_format) .options(**self._load_args) ) else: @@ -197,17 +203,20 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - def _exists(self, schema_path:str) -> bool: + + def _exists(self, schema_path: str) -> bool: """Check the existence of pyspark dataframe. Args: schema_path: schema of saved streaming dataframe """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) - with open(schema_path, encoding='utf-8') as f: + with open(schema_path, encoding="utf-8") as f: schema = StructType.fromJson(json.loads(f.read())) try: - self._get_spark().readStream.schema(schema).load(load_path, self._file_format) + self._get_spark().readStream.schema(schema).load( + load_path, self._file_format + ) except AnalysisException as exception: if ( exception.desc.startswith("Path does not exist:") @@ -215,4 +224,4 @@ def _exists(self, schema_path:str) -> bool: ): return False raise - return True \ No newline at end of file + return True diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index fa3b0fec8..f2fd3bb3d 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,17 +1,14 @@ import json import pytest -import time from pyspark.sql import SparkSession -from kedro_datasets.pandas import ParquetDataSet +from pyspark.sql.types import IntegerType, StringType, StructField, StructType from kedro.io.core import DataSetError from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet -from pyspark.sql.types import IntegerType, StringType, StructField, StructType - def sample_schema(schema_path): - with open(schema_path, encoding='utf-8') as f: + with open(schema_path, encoding="utf-8") as f: try: return StructType.fromJson(json.loads(f.read())) except Exception as exc: @@ -20,6 +17,7 @@ def sample_schema(schema_path): f"Schema is required for streaming data load, Please provide a valid schema_path." 
) from exc + @pytest.fixture def sample_spark_streaming_df(tmp_path): schema = StructType( @@ -41,12 +39,15 @@ def test_load(self, tmp_path, sample_spark_streaming_df): schema_path = (tmp_path / "test.json").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath, file_format="json", save_args=[{"mode", "overwrite"}] ) spark_json_ds.save(sample_spark_streaming_df) - streaming_ds = SparkStreamingDataSet(filepath=filepath, file_format="json", - load_args={"schema": {"filepath": schema_path}}).load() + streaming_ds = SparkStreamingDataSet( + filepath=filepath, + file_format="json", + load_args={"schema": {"filepath": schema_path}}, + ).load() assert streaming_ds.isStreaming schema = sample_schema(schema_path) assert streaming_ds.schema == schema @@ -58,20 +59,23 @@ def test_save(self, tmp_path, sample_spark_streaming_df): checkpoint_path = (tmp_path / "checkpoint").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath_json, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath_json, + file_format="json", + save_args=[{"mode", "overwrite"}], ) spark_json_ds.save(sample_spark_streaming_df) - loaded_with_streaming = SparkStreamingDataSet(filepath=filepath_json, file_format="json", - load_args={"schema": {"filepath": schema_path}}).load() - + loaded_with_streaming = SparkStreamingDataSet( + filepath=filepath_json, + file_format="json", + load_args={"schema": {"filepath": schema_path}}, + ).load() streaming_ds = SparkStreamingDataSet( - filepath=filepath_output, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + filepath=filepath_output, + file_format="json", + save_args={"checkpoint": checkpoint_path, "output_mode": "append"}, ) assert not streaming_ds._exists(schema_path) streaming_ds.save(loaded_with_streaming) assert streaming_ds._exists(schema_path) - - - From 634d884576cb71609ca7a2d8746871727e3181f0 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 17:29:02 +0100 Subject: [PATCH 23/96] formatting Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/__init__.py | 8 +++++++- .../kedro_datasets/spark/deltatable_dataset.py | 3 +-- .../kedro_datasets/spark/spark_streaming_dataset.py | 10 +++++----- .../kedro_datasets/tracking/json_dataset.py | 1 - .../kedro_datasets/tracking/metrics_dataset.py | 1 - kedro-datasets/setup.py | 13 ++++++++----- kedro-datasets/tests/api/test_api_dataset.py | 3 +-- .../bioinformatics/test_biosequence_dataset.py | 3 +-- kedro-datasets/tests/dask/test_parquet_dataset.py | 3 +-- kedro-datasets/tests/email/test_message_dataset.py | 3 +-- .../tests/geojson/test_geojson_dataset.py | 3 +-- .../tests/holoviews/test_holoviews_writer.py | 3 +-- kedro-datasets/tests/json/test_json_dataset.py | 3 +-- .../tests/libsvm/test_svmlight_dataset.py | 3 +-- .../tests/matplotlib/test_matplotlib_writer.py | 3 +-- kedro-datasets/tests/networkx/test_gml_dataset.py | 3 +-- .../tests/networkx/test_graphml_dataset.py | 3 +-- kedro-datasets/tests/networkx/test_json_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_csv_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_excel_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_feather_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_gbq_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_generic_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_hdf_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_json_dataset.py | 3 +-- 
kedro-datasets/tests/pandas/test_parquet_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_sql_dataset.py | 1 - kedro-datasets/tests/pandas/test_xml_dataset.py | 3 +-- kedro-datasets/tests/pickle/test_pickle_dataset.py | 3 +-- kedro-datasets/tests/pillow/test_image_dataset.py | 3 +-- kedro-datasets/tests/plotly/test_json_dataset.py | 3 +-- kedro-datasets/tests/plotly/test_plotly_dataset.py | 3 +-- kedro-datasets/tests/polars/test_csv_dataset.py | 3 +-- kedro-datasets/tests/redis/test_redis_dataset.py | 3 +-- .../tests/snowflake/test_snowpark_dataset.py | 1 - .../tests/spark/test_deltatable_dataset.py | 3 +-- kedro-datasets/tests/spark/test_spark_dataset.py | 9 ++++----- .../tests/spark/test_spark_hive_dataset.py | 3 +-- .../tests/spark/test_spark_jdbc_dataset.py | 1 - .../tests/spark/test_spark_streaming_dataset.py | 5 +++-- kedro-datasets/tests/text/test_text_dataset.py | 3 +-- kedro-datasets/tests/tracking/test_json_dataset.py | 3 +-- .../tests/tracking/test_metrics_dataset.py | 3 +-- kedro-datasets/tests/video/conftest.py | 3 +-- kedro-datasets/tests/video/test_video_dataset.py | 5 ++--- kedro-datasets/tests/video/test_video_objects.py | 3 +-- kedro-datasets/tests/yaml/test_yaml_dataset.py | 3 +-- kedro-docker/features/steps/cli_steps.py | 1 - kedro-docker/kedro_docker/plugin.py | 4 ++-- kedro-docker/tests/test_helpers.py | 1 - kedro-telemetry/kedro_telemetry/plugin.py | 1 - kedro-telemetry/tests/test_masking.py | 1 - kedro-telemetry/tests/test_plugin.py | 3 +-- tools/circleci/circleci_release.py | 1 - 54 files changed, 68 insertions(+), 107 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index 0c46a7fc3..bd649f5c7 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -1,6 +1,12 @@ """Provides I/O modules for Apache Spark.""" -__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet","SparkStreamingDataSet"] +__all__ = [ + "SparkDataSet", + "SparkHiveDataSet", + "SparkJDBCDataSet", + "DeltaTableDataSet", + "SparkStreamingDataSet", +] from contextlib import suppress diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index 34ee6f6a5..9454a47f7 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -6,11 +6,10 @@ from delta.tables import DeltaTable from kedro.io.core import AbstractDataSet, DataSetError +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark.sql import SparkSession from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix - class DeltaTableDataSet(AbstractDataSet[None, DeltaTable]): """``DeltaTableDataSet`` loads data into DeltaTable objects. 
diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 4cb19e6e5..203539a11 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,24 +1,24 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" import json import os -from typing import Any, Dict from copy import deepcopy from pathlib import PurePosixPath -import yaml +from typing import Any, Dict import fsspec +import yaml from kedro.io.core import ( AbstractDataSet, DataSetError, get_filepath_str, get_protocol_and_path, ) +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark import SparkConf -from pyspark.sql.utils import AnalysisException -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType +from pyspark.sql.utils import AnalysisException from yaml.loader import SafeLoader -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix class SparkStreamingDataSet(AbstractDataSet): diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 4235df999..994236d3d 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -5,7 +5,6 @@ from typing import NoReturn from kedro.io.core import DataSetError - from kedro_datasets.json import JSONDataSet as JDS diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index 7c7546a85..2e4e2d970 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -7,7 +7,6 @@ from typing import Dict, NoReturn from kedro.io.core import DataSetError, get_filepath_str - from kedro_datasets.json import JSONDataSet diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 99c30938e..63f41baf7 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -46,10 +46,15 @@ def _collect_requirements(requires): "plotly.PlotlyDataSet": [PANDAS, "plotly>=4.8.0, <6.0"], "plotly.JSONDataSet": ["plotly>=4.8.0, <6.0"], } -polars_require = {"polars.CSVDataSet": [POLARS],} +polars_require = { + "polars.CSVDataSet": [POLARS], +} redis_require = {"redis.PickleDataSet": ["redis~=4.1"]} snowflake_require = { - "snowflake.SnowparkTableDataSet": ["snowflake-snowpark-python~=1.0.0", "pyarrow~=8.0"] + "snowflake.SnowparkTableDataSet": [ + "snowflake-snowpark-python~=1.0.0", + "pyarrow~=8.0", + ] } spark_require = { "spark.SparkDataSet": [SPARK, HDFS, S3FS], @@ -67,9 +72,7 @@ def _collect_requirements(requires): "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", ] } -video_require = { - "video.VideoDataSet": ["opencv-python~=4.5.5.64"] -} +video_require = {"video.VideoDataSet": ["opencv-python~=4.5.5.64"]} yaml_require = {"yaml.YAMLDataSet": [PANDAS, "PyYAML>=4.2, <7.0"]} extras_require = { diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index 848020041..51279c71c 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -5,9 +5,8 @@ import pytest import requests from kedro.io.core import DataSetError -from requests.auth import 
HTTPBasicAuth - from kedro_datasets.api import APIDataSet +from requests.auth import HTTPBasicAuth POSSIBLE_METHODS = ["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"] diff --git a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py index 24666baaf..42b3e252f 100644 --- a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py +++ b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.biosequence import BioSequenceDataSet +from s3fs.core import S3FileSystem LOAD_ARGS = {"format": "fasta"} SAVE_ARGS = {"format": "fasta"} diff --git a/kedro-datasets/tests/dask/test_parquet_dataset.py b/kedro-datasets/tests/dask/test_parquet_dataset.py index 8475dbf47..3824d6c0f 100644 --- a/kedro-datasets/tests/dask/test_parquet_dataset.py +++ b/kedro-datasets/tests/dask/test_parquet_dataset.py @@ -5,12 +5,11 @@ import pyarrow.parquet as pq import pytest from kedro.io import DataSetError +from kedro_datasets.dask import ParquetDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs import S3FileSystem -from kedro_datasets.dask import ParquetDataSet - FILE_NAME = "test.parquet" BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} diff --git a/kedro-datasets/tests/email/test_message_dataset.py b/kedro-datasets/tests/email/test_message_dataset.py index 100daba52..6f97b6c89 100644 --- a/kedro-datasets/tests/email/test_message_dataset.py +++ b/kedro-datasets/tests/email/test_message_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.email import EmailMessageDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/geojson/test_geojson_dataset.py b/kedro-datasets/tests/geojson/test_geojson_dataset.py index b5f3ec4cb..cd6c07c7c 100644 --- a/kedro-datasets/tests/geojson/test_geojson_dataset.py +++ b/kedro-datasets/tests/geojson/test_geojson_dataset.py @@ -7,12 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.geopandas import GeoJSONDataSet from pandas.testing import assert_frame_equal from s3fs import S3FileSystem from shapely.geometry import Point -from kedro_datasets.geopandas import GeoJSONDataSet - @pytest.fixture(params=[None]) def load_version(request): diff --git a/kedro-datasets/tests/holoviews/test_holoviews_writer.py b/kedro-datasets/tests/holoviews/test_holoviews_writer.py index f4f91383e..53ca795f2 100644 --- a/kedro-datasets/tests/holoviews/test_holoviews_writer.py +++ b/kedro-datasets/tests/holoviews/test_holoviews_writer.py @@ -9,9 +9,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.holoviews import HoloviewsWriter +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/json/test_json_dataset.py b/kedro-datasets/tests/json/test_json_dataset.py index 621e51fcd..dafdd8e3e 100644 --- a/kedro-datasets/tests/json/test_json_dataset.py +++ 
b/kedro-datasets/tests/json/test_json_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.json import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py index 8fff3edd2..9fcf09c0c 100644 --- a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py +++ b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.svmlight import SVMLightDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py index 4086e127e..ed4dec348 100644 --- a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py +++ b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py @@ -6,11 +6,10 @@ import matplotlib.pyplot as plt import pytest from kedro.io import DataSetError, Version +from kedro_datasets.matplotlib import MatplotlibWriter from moto import mock_s3 from s3fs import S3FileSystem -from kedro_datasets.matplotlib import MatplotlibWriter - BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "testing", "secret": "testing"} KEY_PATH = "matplotlib" diff --git a/kedro-datasets/tests/networkx/test_gml_dataset.py b/kedro-datasets/tests/networkx/test_gml_dataset.py index a3a89eca7..dd589019d 100644 --- a/kedro-datasets/tests/networkx/test_gml_dataset.py +++ b/kedro-datasets/tests/networkx/test_gml_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import GMLDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/networkx/test_graphml_dataset.py b/kedro-datasets/tests/networkx/test_graphml_dataset.py index 4e0dcf40d..9ff22883e 100644 --- a/kedro-datasets/tests/networkx/test_graphml_dataset.py +++ b/kedro-datasets/tests/networkx/test_graphml_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import GraphMLDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/networkx/test_json_dataset.py b/kedro-datasets/tests/networkx/test_json_dataset.py index 4d6e582a8..ed437f69a 100644 --- a/kedro-datasets/tests/networkx/test_json_dataset.py +++ b/kedro-datasets/tests/networkx/test_json_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import JSONDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 5cc1ee36b..53a1e7c52 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -12,12 +12,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError 
from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.pandas import CSVDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import CSVDataSet - BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py index 1080cc9b6..bae8c5147 100644 --- a/kedro-datasets/tests/pandas/test_excel_dataset.py +++ b/kedro-datasets/tests/pandas/test_excel_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import ExcelDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import ExcelDataSet - @pytest.fixture def filepath_excel(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_feather_dataset.py b/kedro-datasets/tests/pandas/test_feather_dataset.py index 80c1ce678..ec995d657 100644 --- a/kedro-datasets/tests/pandas/test_feather_dataset.py +++ b/kedro-datasets/tests/pandas/test_feather_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import FeatherDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import FeatherDataSet - @pytest.fixture def filepath_feather(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_gbq_dataset.py b/kedro-datasets/tests/pandas/test_gbq_dataset.py index e239dbaba..d970db36e 100644 --- a/kedro-datasets/tests/pandas/test_gbq_dataset.py +++ b/kedro-datasets/tests/pandas/test_gbq_dataset.py @@ -4,9 +4,8 @@ import pytest from google.cloud.exceptions import NotFound from kedro.io.core import DataSetError -from pandas.testing import assert_frame_equal - from kedro_datasets.pandas import GBQQueryDataSet, GBQTableDataSet +from pandas.testing import assert_frame_equal DATASET = "dataset" TABLE_NAME = "table_name" diff --git a/kedro-datasets/tests/pandas/test_generic_dataset.py b/kedro-datasets/tests/pandas/test_generic_dataset.py index 6f40bb0d4..2526c1ed6 100644 --- a/kedro-datasets/tests/pandas/test_generic_dataset.py +++ b/kedro-datasets/tests/pandas/test_generic_dataset.py @@ -9,11 +9,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER, generate_timestamp +from kedro_datasets.pandas import GenericDataSet from pandas._testing import assert_frame_equal from s3fs import S3FileSystem -from kedro_datasets.pandas import GenericDataSet - @pytest.fixture def filepath_sas(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_hdf_dataset.py b/kedro-datasets/tests/pandas/test_hdf_dataset.py index 563ba63d9..c59e7a104 100644 --- a/kedro-datasets/tests/pandas/test_hdf_dataset.py +++ b/kedro-datasets/tests/pandas/test_hdf_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import HDFDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import HDFDataSet - HDF_KEY = "data" diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index df2e856d5..7da50165e 100644 --- 
a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import JSONDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import JSONDataSet - @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_parquet_dataset.py b/kedro-datasets/tests/pandas/test_parquet_dataset.py index 2d7ce2996..cc62ed203 100644 --- a/kedro-datasets/tests/pandas/test_parquet_dataset.py +++ b/kedro-datasets/tests/pandas/test_parquet_dataset.py @@ -7,12 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import ParquetDataSet from pandas.testing import assert_frame_equal from pyarrow.fs import FSSpecHandler, PyFileSystem from s3fs.core import S3FileSystem -from kedro_datasets.pandas import ParquetDataSet - FILENAME = "test.parquet" diff --git a/kedro-datasets/tests/pandas/test_sql_dataset.py b/kedro-datasets/tests/pandas/test_sql_dataset.py index 308582859..b810748c2 100644 --- a/kedro-datasets/tests/pandas/test_sql_dataset.py +++ b/kedro-datasets/tests/pandas/test_sql_dataset.py @@ -6,7 +6,6 @@ import pytest import sqlalchemy from kedro.io import DataSetError - from kedro_datasets.pandas import SQLQueryDataSet, SQLTableDataSet TABLE_NAME = "table_a" diff --git a/kedro-datasets/tests/pandas/test_xml_dataset.py b/kedro-datasets/tests/pandas/test_xml_dataset.py index bd62ea586..65be88174 100644 --- a/kedro-datasets/tests/pandas/test_xml_dataset.py +++ b/kedro-datasets/tests/pandas/test_xml_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import XMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import XMLDataSet - @pytest.fixture def filepath_xml(tmp_path): diff --git a/kedro-datasets/tests/pickle/test_pickle_dataset.py b/kedro-datasets/tests/pickle/test_pickle_dataset.py index fb95681a3..2846201cf 100644 --- a/kedro-datasets/tests/pickle/test_pickle_dataset.py +++ b/kedro-datasets/tests/pickle/test_pickle_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pickle import PickleDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pickle import PickleDataSet - @pytest.fixture def filepath_pickle(tmp_path): diff --git a/kedro-datasets/tests/pillow/test_image_dataset.py b/kedro-datasets/tests/pillow/test_image_dataset.py index ea500b20d..ed27e3cb9 100644 --- a/kedro-datasets/tests/pillow/test_image_dataset.py +++ b/kedro-datasets/tests/pillow/test_image_dataset.py @@ -6,11 +6,10 @@ from fsspec.implementations.local import LocalFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.pillow import ImageDataSet from PIL import Image, ImageChops from s3fs.core import S3FileSystem -from kedro_datasets.pillow import ImageDataSet - @pytest.fixture def filepath_png(tmp_path): diff --git a/kedro-datasets/tests/plotly/test_json_dataset.py 
b/kedro-datasets/tests/plotly/test_json_dataset.py index ab6e17d9c..0115a72dd 100644 --- a/kedro-datasets/tests/plotly/test_json_dataset.py +++ b/kedro-datasets/tests/plotly/test_json_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.plotly import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/plotly/test_plotly_dataset.py b/kedro-datasets/tests/plotly/test_plotly_dataset.py index a422060e8..9b33492bf 100644 --- a/kedro-datasets/tests/plotly/test_plotly_dataset.py +++ b/kedro-datasets/tests/plotly/test_plotly_dataset.py @@ -8,12 +8,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER +from kedro_datasets.plotly import PlotlyDataSet from plotly import graph_objects from plotly.graph_objs import Scatter from s3fs.core import S3FileSystem -from kedro_datasets.plotly import PlotlyDataSet - @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index d79183539..4c0807d91 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -12,12 +12,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.polars import CSVDataSet from moto import mock_s3 from polars.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.polars import CSVDataSet - BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/redis/test_redis_dataset.py b/kedro-datasets/tests/redis/test_redis_dataset.py index eaa8abbd2..ddda22c17 100644 --- a/kedro-datasets/tests/redis/test_redis_dataset.py +++ b/kedro-datasets/tests/redis/test_redis_dataset.py @@ -8,9 +8,8 @@ import pytest import redis from kedro.io import DataSetError -from pandas.testing import assert_frame_equal - from kedro_datasets.redis import PickleDataSet +from pandas.testing import assert_frame_equal @pytest.fixture(params=["pickle"]) diff --git a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py index 2133953b5..d73731df2 100644 --- a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py +++ b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py @@ -6,7 +6,6 @@ try: import snowflake.snowpark as sp - from kedro_datasets.snowflake import SnowparkTableDataSet as spds except ImportError: pass # this is only for test discovery to succeed on Python <> 3.8 diff --git a/kedro-datasets/tests/spark/test_deltatable_dataset.py b/kedro-datasets/tests/spark/test_deltatable_dataset.py index 5cbbe62b7..430c78ea2 100644 --- a/kedro-datasets/tests/spark/test_deltatable_dataset.py +++ b/kedro-datasets/tests/spark/test_deltatable_dataset.py @@ -4,12 +4,11 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner +from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet - @pytest.fixture def sample_spark_df(): diff 
--git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index 9452b007d..9a3e58035 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -12,6 +12,10 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner, SequentialRunner +from kedro_datasets.pandas import CSVDataSet, ParquetDataSet +from kedro_datasets.pickle import PickleDataSet +from kedro_datasets.spark import SparkDataSet +from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils from moto import mock_s3 from pyspark.sql import SparkSession from pyspark.sql.functions import col @@ -24,11 +28,6 @@ ) from pyspark.sql.utils import AnalysisException -from kedro_datasets.pandas import CSVDataSet, ParquetDataSet -from kedro_datasets.pickle import PickleDataSet -from kedro_datasets.spark import SparkDataSet -from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils - FOLDER_NAME = "fake_folder" FILENAME = "test.parquet" BUCKET_NAME = "test_bucket" diff --git a/kedro-datasets/tests/spark/test_spark_hive_dataset.py b/kedro-datasets/tests/spark/test_spark_hive_dataset.py index e0b8fc333..88c18aee6 100644 --- a/kedro-datasets/tests/spark/test_spark_hive_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_hive_dataset.py @@ -5,13 +5,12 @@ import pytest from kedro.io import DataSetError +from kedro_datasets.spark import SparkHiveDataSet from psutil import Popen from pyspark import SparkContext from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType -from kedro_datasets.spark import SparkHiveDataSet - TESTSPARKDIR = "test_spark_dir" diff --git a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py index 0f3d0e66b..73e091ef9 100644 --- a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py @@ -2,7 +2,6 @@ import pytest from kedro.io import DataSetError - from kedro_datasets.spark import SparkJDBCDataSet diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index f2fd3bb3d..fe59c5810 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,10 +1,11 @@ import json + import pytest -from pyspark.sql import SparkSession -from pyspark.sql.types import IntegerType, StringType, StructField, StructType from kedro.io.core import DataSetError from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet +from pyspark.sql import SparkSession +from pyspark.sql.types import IntegerType, StringType, StructField, StructType def sample_schema(schema_path): diff --git a/kedro-datasets/tests/text/test_text_dataset.py b/kedro-datasets/tests/text/test_text_dataset.py index 733cc6c1f..a4bee6896 100644 --- a/kedro-datasets/tests/text/test_text_dataset.py +++ b/kedro-datasets/tests/text/test_text_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.text import TextDataSet +from s3fs.core import S3FileSystem STRING = "Write to text file." 
diff --git a/kedro-datasets/tests/tracking/test_json_dataset.py b/kedro-datasets/tests/tracking/test_json_dataset.py index 62172b1a4..2529868c4 100644 --- a/kedro-datasets/tests/tracking/test_json_dataset.py +++ b/kedro-datasets/tests/tracking/test_json_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.tracking import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/tracking/test_metrics_dataset.py b/kedro-datasets/tests/tracking/test_metrics_dataset.py index 2c1157de9..ad9f4a1cb 100644 --- a/kedro-datasets/tests/tracking/test_metrics_dataset.py +++ b/kedro-datasets/tests/tracking/test_metrics_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.tracking import MetricsDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/video/conftest.py b/kedro-datasets/tests/video/conftest.py index 7a0a4c87b..0dd5576dc 100644 --- a/kedro-datasets/tests/video/conftest.py +++ b/kedro-datasets/tests/video/conftest.py @@ -1,11 +1,10 @@ from pathlib import Path import pytest +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from PIL import Image from utils import TEST_FPS, TEST_HEIGHT, TEST_WIDTH -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo - @pytest.fixture(scope="module") def red_frame(): diff --git a/kedro-datasets/tests/video/test_video_dataset.py b/kedro-datasets/tests/video/test_video_dataset.py index 1ac3d1ce4..b4428c4df 100644 --- a/kedro-datasets/tests/video/test_video_dataset.py +++ b/kedro-datasets/tests/video/test_video_dataset.py @@ -1,11 +1,10 @@ import boto3 import pytest from kedro.io import DataSetError -from moto import mock_s3 -from utils import TEST_FPS, assert_videos_equal - from kedro_datasets.video import VideoDataSet from kedro_datasets.video.video_dataset import FileVideo, SequenceVideo +from moto import mock_s3 +from utils import TEST_FPS, assert_videos_equal S3_BUCKET_NAME = "test_bucket" S3_KEY_PATH = "video" diff --git a/kedro-datasets/tests/video/test_video_objects.py b/kedro-datasets/tests/video/test_video_objects.py index 1cb7cca75..3adb701d2 100644 --- a/kedro-datasets/tests/video/test_video_objects.py +++ b/kedro-datasets/tests/video/test_video_objects.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from utils import ( DEFAULT_FOURCC, MJPEG_FOURCC, @@ -21,8 +22,6 @@ assert_images_equal, ) -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo - class TestSequenceVideo: def test_sequence_video_indexing_first(self, color_video, red_frame): diff --git a/kedro-datasets/tests/yaml/test_yaml_dataset.py b/kedro-datasets/tests/yaml/test_yaml_dataset.py index 653606c17..2cadeee7d 100644 --- a/kedro-datasets/tests/yaml/test_yaml_dataset.py +++ b/kedro-datasets/tests/yaml/test_yaml_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.yaml import YAMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.yaml import 
YAMLDataSet - @pytest.fixture def filepath_yaml(tmp_path): diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 0306c1e2f..2c680fd70 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -8,7 +8,6 @@ import behave import yaml from behave import given, then, when - from features.steps.sh_run import ChildTerminatingPopen, run from features.steps.util import ( TimeoutException, diff --git a/kedro-docker/kedro_docker/plugin.py b/kedro-docker/kedro_docker/plugin.py index 27af7db96..cc8dda1c4 100644 --- a/kedro-docker/kedro_docker/plugin.py +++ b/kedro-docker/kedro_docker/plugin.py @@ -125,9 +125,9 @@ def docker_init(spark): if KEDRO_VERSION.match(">=0.17.0"): verbose = KedroCliError.VERBOSE_ERROR else: - from kedro.framework.cli.cli import ( # noqa # pylint:disable=import-outside-toplevel, no-name-in-module + from kedro.framework.cli.cli import ( _VERBOSE as verbose, - ) + ) # noqa # pylint:disable=import-outside-toplevel, no-name-in-module docker_file_version = "spark" if spark else "simple" docker_file = f"Dockerfile.{docker_file_version}" diff --git a/kedro-docker/tests/test_helpers.py b/kedro-docker/tests/test_helpers.py index 40b5d9306..f205c9efe 100644 --- a/kedro-docker/tests/test_helpers.py +++ b/kedro-docker/tests/test_helpers.py @@ -3,7 +3,6 @@ import pytest from click import ClickException - from kedro_docker.helpers import ( add_jupyter_args, check_docker_image_exists, diff --git a/kedro-telemetry/kedro_telemetry/plugin.py b/kedro-telemetry/kedro_telemetry/plugin.py index 5eeb4d489..1027d541d 100644 --- a/kedro-telemetry/kedro_telemetry/plugin.py +++ b/kedro-telemetry/kedro_telemetry/plugin.py @@ -22,7 +22,6 @@ from kedro.framework.startup import ProjectMetadata from kedro.io.data_catalog import DataCatalog from kedro.pipeline import Pipeline - from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.masking import _get_cli_structure, _mask_kedro_cli diff --git a/kedro-telemetry/tests/test_masking.py b/kedro-telemetry/tests/test_masking.py index 74773e2f4..1e674096b 100644 --- a/kedro-telemetry/tests/test_masking.py +++ b/kedro-telemetry/tests/test_masking.py @@ -9,7 +9,6 @@ from kedro import __version__ as kedro_version from kedro.framework.cli.cli import KedroCLI, cli from kedro.framework.startup import ProjectMetadata - from kedro_telemetry.masking import ( MASK, _get_cli_structure, diff --git a/kedro-telemetry/tests/test_plugin.py b/kedro-telemetry/tests/test_plugin.py index 222bcc914..9b1a6460b 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -9,8 +9,6 @@ from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline -from pytest import fixture - from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.plugin import ( KedroTelemetryCLIHooks, @@ -18,6 +16,7 @@ _check_for_telemetry_consent, _confirm_consent, ) +from pytest import fixture REPO_NAME = "dummy_project" PACKAGE_NAME = "dummy_package" diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py index dd05d4c5a..e8f5d8449 100755 --- a/tools/circleci/circleci_release.py +++ b/tools/circleci/circleci_release.py @@ -8,7 +8,6 @@ import requests from requests.structures import CaseInsensitiveDict - from utils.check_no_version_pypi import check_no_version_pypi from utils.package_version import get_package_version From 
9e8f55cea58eeb484a05ec70a724feacddb52ecb Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 19:08:07 +0100 Subject: [PATCH 24/96] formatting Signed-off-by: Tingting_Wan --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index fe59c5810..82b90481c 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -2,11 +2,12 @@ import pytest from kedro.io.core import DataSetError -from kedro_datasets.spark.spark_dataset import SparkDataSet -from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from kedro_datasets.spark.spark_dataset import SparkDataSet +from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet + def sample_schema(schema_path): with open(schema_path, encoding="utf-8") as f: @@ -29,7 +30,7 @@ def sample_spark_streaming_df(tmp_path): ) data = [("0001", 2), ("0001", 7), ("0002", 4)] schema_path = (tmp_path / "test.json").as_posix() - with open(schema_path, "w") as f: + with open(schema_path, "w", encoding="utf-8") as f: json.dump(schema.jsonValue(), f) return SparkSession.builder.getOrCreate().createDataFrame(data, schema) From dbdf19c61acb021506a07e57bd3ae504d2c04a84 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 19:08:44 +0100 Subject: [PATCH 25/96] formatting Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 203539a11..79a044c6d 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -13,13 +13,14 @@ get_filepath_str, get_protocol_and_path, ) -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark import SparkConf from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType from pyspark.sql.utils import AnalysisException from yaml.loader import SafeLoader +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix + class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. @@ -47,6 +48,7 @@ class SparkStreamingDataSet(AbstractDataSet): """ + # pylint: disable=too-many-instance-attributes DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] @@ -156,7 +158,8 @@ def _get_spark(): def _load(self) -> DataFrame: """Loads data from filepath. - If the connector type is kafka then no file_path is required, schema needs to be seperated from load_args + If the connector type is kafka then no file_path is required, schema needs to be + seperated from load_args. Returns: Data from filepath as pyspark dataframe. 
@@ -211,8 +214,8 @@ def _exists(self, schema_path: str) -> bool: schema_path: schema of saved streaming dataframe """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) - with open(schema_path, encoding="utf-8") as f: - schema = StructType.fromJson(json.loads(f.read())) + with open(schema_path, encoding="utf-8") as schema_file: + schema = StructType.fromJson(json.loads(schema_file.read())) try: self._get_spark().readStream.schema(schema).load( load_path, self._file_format From 1a7a4776c86f8835dd2ee3e95562555d3e1ecbe2 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Wed, 12 Apr 2023 13:41:49 +0100 Subject: [PATCH 26/96] add spark_stream_dataset.py Signed-off-by: Tingting_Wan --- .../spark/spark_stream_dataset.py | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py new file mode 100644 index 000000000..6844e04cf --- /dev/null +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -0,0 +1,128 @@ +"""SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +from typing import Any, Dict + +import pyspark +import yaml +from kedro.io import AbstractDataSet +from pyspark import SparkConf +from pyspark.sql import SparkSession +from yaml.loader import SafeLoader + + +class SparkStreamingDataSet(AbstractDataSet): + """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. + + Example usage for the + `YAML API `_: + .. code-block:: yaml + + raw.new_inventory: + type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + filepath: data/01_raw/stream/inventory/ + file_format: json + + int.new_inventory: + type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + filepath: data/02_intermediate/inventory/ + file_format: csv + save_args: + output_mode: append + checkpoint: data/04_checkpoint/int_new_inventory + header: True + load_args: + header: True + + """ + + def __init__( + self, + filepath: str = "", + file_format: str = "", + save_args: Dict[str, str] = {}, + load_args: Dict[str, str] = {}, + ): + """Creates a new instance of SparkStreamingDataSet. + + Args: + filepath: Filepath in POSIX format to a Spark dataframe. When using Databricks + specify ``filepath``s starting with ``/dbfs/``. For message brokers such as + Kafka and all filepath is not required. + file_format: File format used during load and save + operations. These are formats supported by the running + SparkContext include parquet, csv, delta. For a list of supported + formats please refer to Apache Spark documentation at + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + load_args: Load args passed to Spark DataFrameReader load method. + It is dependent on the selected file format. You can find + a list of read options for each supported format + in Spark DataFrame read documentation: + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + save_args: Save args passed to Spark DataFrame write options. + Similar to load_args this is dependent on the selected file + format. You can pass ``mode`` and ``partitionBy`` to specify + your overwrite mode and partitioning respectively. 
You can find + a list of options for each format in Spark DataFrame + write documentation: + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + """ + self._filepath_ = filepath + self.file_format = file_format + self._save_args = save_args + self._load_args = load_args + self.output_format = [ + "kafka" + ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + + # read spark configuration from spark yml file and create a spark context + with open("conf/base/spark.yml") as f: + self.parameters = yaml.load(f, Loader=SafeLoader) + self.spark_conf = SparkConf().setAll(self.parameters.items()) + + # Initialise the spark session + self.spark_session_conf = SparkSession.builder.config(conf=self.spark_conf) + self.spark = self.spark_session_conf.getOrCreate() + + def _load(self) -> pyspark.sql.DataFrame: + """Loads data from filepath. + If the connector type is kafka then no file_path is required + + Returns: + Data from filepath as pyspark dataframe. + """ + input_constructor = self.spark.readStream.format(self.file_format).options( + **self._load_args + ) + return ( + input_constructor.load() + if self.file_format + in self.output_format # if the connector type is message broker + else input_constructor.load(self._filepath_) + ) + + def _save(self, data: pyspark.sql.DataFrame) -> None: + """Saves pyspark dataframe. + + Args: + data: PySpark streaming dataframe for saving + + """ + + output_constructor = data.writeStream.format(self.file_format) + + # for message brokers path is not needed + if self.file_format not in self.output_format: + output_constructor = output_constructor.option("path", self._filepath_) + + ( + output_constructor.option( + "checkpointLocation", self._save_args.pop("checkpoint") + ) + .outputMode(self._save_args.pop("output_mode")) + .options(**self._save_args) + .start() + ) + + def _describe(self) -> Dict[str, Any]: + """Returns a dict that describes attributes of the dataset.""" + return None From e8779442f48083430453ffdc4606c9c1f3a0a3a3 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 11:48:57 +0100 Subject: [PATCH 27/96] restructure the strean dataset to align with the other spark dataset Signed-off-by: Tingting_Wan --- .../spark/spark_stream_dataset.py | 57 ++++++++++++++----- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 6844e04cf..0992ab5ce 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -1,13 +1,13 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" from typing import Any, Dict - -import pyspark +from copy import deepcopy import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf -from pyspark.sql import SparkSession +from pathlib import PurePosixPath +from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader - +from kedro_datasets.spark.spark_dataset import _split_filepath class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. 
@@ -35,13 +35,16 @@ class SparkStreamingDataSet(AbstractDataSet): """ + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + def __init__( self, filepath: str = "", file_format: str = "", - save_args: Dict[str, str] = {}, - load_args: Dict[str, str] = {}, - ): + save_args: Dict[str, Any] = None, + load_args: Dict[str, Any] = None, + ) -> None: """Creates a new instance of SparkStreamingDataSet. Args: @@ -74,23 +77,46 @@ def __init__( "kafka" ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + fs_prefix, filepath = _split_filepath(filepath) + + self._fs_prefix = fs_prefix + self._filepath = PurePosixPath(filepath) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + def _describe(self) -> Dict[str, Any]: + """Returns a dict that describes attributes of the dataset.""" + return { + "filepath": self._fs_prefix + str(self._filepath), + "file_format": self._file_format, + "load_args": self._load_args, + "save_args": self._save_args, + } + + @staticmethod + def _get_spark(self): # read spark configuration from spark yml file and create a spark context with open("conf/base/spark.yml") as f: self.parameters = yaml.load(f, Loader=SafeLoader) self.spark_conf = SparkConf().setAll(self.parameters.items()) # Initialise the spark session - self.spark_session_conf = SparkSession.builder.config(conf=self.spark_conf) - self.spark = self.spark_session_conf.getOrCreate() + return SparkSession.builder.config(conf=self.spark_conf).getOrCreate() - def _load(self) -> pyspark.sql.DataFrame: + def _load(self) -> DataFrame: """Loads data from filepath. If the connector type is kafka then no file_path is required Returns: Data from filepath as pyspark dataframe. """ - input_constructor = self.spark.readStream.format(self.file_format).options( + input_constructor = self._get_spark().readStream.format(self.file_format).options( **self._load_args ) return ( @@ -100,7 +126,7 @@ def _load(self) -> pyspark.sql.DataFrame: else input_constructor.load(self._filepath_) ) - def _save(self, data: pyspark.sql.DataFrame) -> None: + def _save(self, data: DataFrame) -> None: """Saves pyspark dataframe. Args: @@ -123,6 +149,7 @@ def _save(self, data: pyspark.sql.DataFrame) -> None: .start() ) - def _describe(self) -> Dict[str, Any]: - """Returns a dict that describes attributes of the dataset.""" - return None + + + + From 09e9cf2649175495bec87e3ea0fb7383eee00b4a Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 17:43:56 +0100 Subject: [PATCH 28/96] adding README.md for specification Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 38 +++++++++++++++++++ .../spark/spark_stream_dataset.py | 17 +++++---- 2 files changed, 48 insertions(+), 7 deletions(-) create mode 100644 kedro-datasets/kedro_datasets/spark/README.md diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md new file mode 100644 index 000000000..bded31532 --- /dev/null +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -0,0 +1,38 @@ +# Spark Streaming + +``SparkStreamingDatasets`` loads and saves data to spark streaming DatafFrames. 
+See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. + +To work with multiple streaming nodes, 2 hook are required for: + - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details + - Running streaming query without termination unless exception + +#### Example SparkStreamsHook: + +```python +from kedro.framework.hooks import hook_impl +from pyspark.sql import SparkSession + +class SparkStreamsHook: + @hook_impl + def after_pipeline_run(self) -> None: + """Starts a spark streaming await session + once the pipeline reaches the last node + """ + + spark = SparkSession.builder.getOrCreate() + spark.streams.awaitAnyTermination() +``` +To make the application work with kafka format, respective spark configuration need to be added in ``conf/base/spark.yml``. + +#### Example spark.yml: + +```yaml +spark.driver.maxResultSize: 3g +spark.scheduler.mode: FAIR +spark.sql.streaming.schemaInference: True +spark.streaming.stopGracefullyOnShutdown: true # graceful shutdown guarantees (under some conditions, listed below in the post) that all received data is processed before destroying Spark context +spark.sql.streaming.stateStore.stateSchemaCheck: false # since schema is not mentioned explicitly +spark.jars.packages: org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1 # spark and kafka configuraton for reading kafka files (not required if kafka is not used) + +``` diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 0992ab5ce..382c45286 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -1,4 +1,5 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +import os from typing import Any, Dict from copy import deepcopy import yaml @@ -101,13 +102,15 @@ def _describe(self) -> Dict[str, Any]: @staticmethod def _get_spark(self): - # read spark configuration from spark yml file and create a spark context - with open("conf/base/spark.yml") as f: - self.parameters = yaml.load(f, Loader=SafeLoader) - self.spark_conf = SparkConf().setAll(self.parameters.items()) - - # Initialise the spark session - return SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + spark_conf_path = "conf/base/spark.yml" + if os.path.exists(spark_conf_path): + with open(spark_conf_path) as f: + self.parameters = yaml.load(f, Loader=SafeLoader) + self.spark_conf = SparkConf().setAll(self.parameters.items()) + spark = SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + else: + spark = SparkSession.builder.getOrCreate() + return spark def _load(self) -> DataFrame: """Loads data from filepath. 
From 2e30ec07941be9f1d5c4e4866d2c0025381ed068 Mon Sep 17 00:00:00 2001 From: Tingting Wan <110382691+Tingting711@users.noreply.github.com> Date: Fri, 14 Apr 2023 01:40:10 +0800 Subject: [PATCH 29/96] Update kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py Co-authored-by: Nok Lam Chan Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py index 382c45286..77bf62f40 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py @@ -19,7 +19,7 @@ class SparkStreamingDataSet(AbstractDataSet): .. code-block:: yaml raw.new_inventory: - type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + type: spark.SparkStreamingDataSet filepath: data/01_raw/stream/inventory/ file_format: json From 6147636c7be7f7131b2534ee81c7a397ec8277ea Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 13 Apr 2023 18:41:12 +0100 Subject: [PATCH 30/96] rename the dataset Signed-off-by: Tingting_Wan --- .../spark/{spark_stream_dataset.py => spark_streaming_dataset.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kedro-datasets/kedro_datasets/spark/{spark_stream_dataset.py => spark_streaming_dataset.py} (100%) diff --git a/kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py similarity index 100% rename from kedro-datasets/kedro_datasets/spark/spark_stream_dataset.py rename to kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py From 29376e94d5edb736569e59594edf22c224aa1cf6 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 17 Apr 2023 10:19:43 +0100 Subject: [PATCH 31/96] resolve comments Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/__init__.py | 2 ++ kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index 3dede09aa..c93d3f0df 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -12,3 +12,5 @@ from .spark_jdbc_dataset import SparkJDBCDataSet with suppress(ImportError): from .deltatable_dataset import DeltaTableDataSet +with suppress(ImportError): + from .spark_streaming_dataset import SparkStreamingDataSet diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 77bf62f40..1ee271e87 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -24,7 +24,7 @@ class SparkStreamingDataSet(AbstractDataSet): file_format: json int.new_inventory: - type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + type: spark.SparkStreamingDataSet filepath: data/02_intermediate/inventory/ file_format: csv save_args: From 42ed37a38537d01b35c6e615a2f3d71493984382 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 17 Apr 2023 11:42:05 +0100 Subject: [PATCH 32/96] fix format and pylint Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git 
a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 1ee271e87..fa6fc9c7e 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -2,14 +2,15 @@ import os from typing import Any, Dict from copy import deepcopy +from pathlib import PurePosixPath import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf -from pathlib import PurePosixPath from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader from kedro_datasets.spark.spark_dataset import _split_filepath + class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. @@ -71,12 +72,10 @@ def __init__( https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html """ self._filepath_ = filepath - self.file_format = file_format + self._file_format = file_format self._save_args = save_args self._load_args = load_args - self.output_format = [ - "kafka" - ] # message broker formats, such as Kafka, Kinesis, and others, require different methods for loading and saving. + self.output_format = ["kafka"] fs_prefix, filepath = _split_filepath(filepath) @@ -101,13 +100,15 @@ def _describe(self) -> Dict[str, Any]: } @staticmethod - def _get_spark(self): + def _get_spark(): spark_conf_path = "conf/base/spark.yml" if os.path.exists(spark_conf_path): - with open(spark_conf_path) as f: - self.parameters = yaml.load(f, Loader=SafeLoader) - self.spark_conf = SparkConf().setAll(self.parameters.items()) - spark = SparkSession.builder.config(conf=self.spark_conf).getOrCreate() + with open( + spark_conf_path, encoding="utf-8" + ) as File: # pylint: disable=invalid-name + parameters = yaml.load(File, Loader=SafeLoader) + spark_conf = SparkConf().setAll(parameters.items()) + spark = SparkSession.builder.config(conf=spark_conf).getOrCreate() else: spark = SparkSession.builder.getOrCreate() return spark @@ -119,12 +120,14 @@ def _load(self) -> DataFrame: Returns: Data from filepath as pyspark dataframe. 
""" - input_constructor = self._get_spark().readStream.format(self.file_format).options( - **self._load_args + input_constructor = ( + self._get_spark() + .readStream.format(self._file_format) + .options(**self._load_args) ) return ( input_constructor.load() - if self.file_format + if self._file_format in self.output_format # if the connector type is message broker else input_constructor.load(self._filepath_) ) @@ -137,10 +140,10 @@ def _save(self, data: DataFrame) -> None: """ - output_constructor = data.writeStream.format(self.file_format) + output_constructor = data.writeStream.format(self._file_format) # for message brokers path is not needed - if self.file_format not in self.output_format: + if self._file_format not in self.output_format: output_constructor = output_constructor.option("path", self._filepath_) ( @@ -151,8 +154,3 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - - - - - From d93d9b9d41a0cb9c29243e1950369fea77e3d5ed Mon Sep 17 00:00:00 2001 From: Tingting Wan <110382691+Tingting711@users.noreply.github.com> Date: Mon, 17 Apr 2023 21:21:08 +0800 Subject: [PATCH 33/96] Update kedro-datasets/kedro_datasets/spark/README.md Co-authored-by: Deepyaman Datta Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index bded31532..f222df00a 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -1,6 +1,6 @@ # Spark Streaming -``SparkStreamingDatasets`` loads and saves data to spark streaming DatafFrames. +``SparkStreamingDataSet`` loads and saves data to streaming DataFrames. See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. 
To work with multiple streaming nodes, 2 hook are required for: From 5b83444ebd0af2d28f51bd75121ee867b968f76f Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Fri, 21 Apr 2023 23:02:52 +0100 Subject: [PATCH 34/96] add unit tests and SparkStreamingDataset in init.py Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/__init__.py | 2 +- .../spark/spark_streaming_dataset.py | 14 +++++- .../spark/test_spark_streaming_dataset.py | 47 +++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 kedro-datasets/tests/spark/test_spark_streaming_dataset.py diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index c93d3f0df..0c46a7fc3 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -1,6 +1,6 @@ """Provides I/O modules for Apache Spark.""" -__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet"] +__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet","SparkStreamingDataSet"] from contextlib import suppress diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index fa6fc9c7e..10680d661 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -6,9 +6,10 @@ import yaml from kedro.io import AbstractDataSet from pyspark import SparkConf +from pyspark.errors.exceptions.captured import AnalysisException from pyspark.sql import SparkSession, DataFrame from yaml.loader import SafeLoader -from kedro_datasets.spark.spark_dataset import _split_filepath +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix class SparkStreamingDataSet(AbstractDataSet): @@ -154,3 +155,14 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) + def _exists(self) -> bool: + load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + + try: + self._get_spark().read.load(path=load_path, format="delta") + except AnalysisException as exception: + if "is not a Delta table" in exception.desc: + return False + raise + + return True \ No newline at end of file diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py new file mode 100644 index 000000000..47a427742 --- /dev/null +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -0,0 +1,47 @@ +import pytest +import time +from pyspark.sql import SparkSession +from kedro_datasets.spark import SparkStreamingDataSet,SparkDataSet +from pyspark.sql.types import IntegerType, StringType, StructField, StructType + + +@pytest.fixture +def sample_spark_streaming_df_one(): + schema = StructType( + [ + StructField("sku", StringType(), True), + StructField("new_stock", IntegerType(), True), + ] + ) + data = [("0001", 2), ("0001", 7), ("0002", 4)] + + return SparkSession.builder.getOrCreate() \ + .createDataFrame(data, schema) + + +class TestStreamingDataSet: + def test_load(self,tmp_path, sample_spark_streaming_df_one): + filepath = (tmp_path / "test_streams").as_posix() + spark_json_ds = SparkDataSet(filepath=filepath, file_format="json",save_args=["mode","overwrite"]) + spark_json_ds.save(sample_spark_streaming_df_one) + loaded_with_spark = spark_json_ds.load() + + stream_df = SparkStreamingDataSet(filepath=filepath, file_format="json")._load() + 
assert stream_df.isStreaming + + stream_query = stream_df.writeStream.format("memory").queryName("test").start() + assert stream_query.isActive + time.sleep(3) + stream_query.stop() + loaded_memory_stream = SparkSession.builder.getOrCreate().sql("select * from test") + + assert loaded_memory_stream.exceptAll(loaded_with_spark).count()==0 + + + def test_save(self, tmp_path, sample_spark_df): + filepath = (tmp_path / "test_streams").as_posix() + checkpoint_path = (tmp_path / "checkpoint").as_posix() + streaming_ds = SparkStreamingDataSet(filepath=filepath, save_args=["checkpointLocation",checkpoint_path]) + assert not streaming_ds.exists() + + From 5b0630e11306643ae0d4c1706fe18a501a6b1179 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:20:02 +0100 Subject: [PATCH 35/96] add unit tests Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 76 +++++++++++++++---- .../spark/test_spark_streaming_dataset.py | 66 ++++++++++------ 2 files changed, 107 insertions(+), 35 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 10680d661..a508a3903 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,13 +1,17 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" +import json import os from typing import Any, Dict from copy import deepcopy from pathlib import PurePosixPath import yaml -from kedro.io import AbstractDataSet + +import fsspec +from kedro.io.core import AbstractDataSet,DataSetError, get_filepath_str, get_protocol_and_path from pyspark import SparkConf -from pyspark.errors.exceptions.captured import AnalysisException +from pyspark.sql.utils import AnalysisException from pyspark.sql import SparkSession, DataFrame +from pyspark.sql.types import StructType from yaml.loader import SafeLoader from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix @@ -91,6 +95,37 @@ def __init__( if save_args is not None: self._save_args.update(save_args) + # Handle schema load argument + self._schema = self._load_args.pop("schema", None) + if self._schema is not None: + if isinstance(self._schema, dict): + self._schema = self._load_schema_from_file(self._schema) + + @staticmethod + def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: + filepath = schema.get("filepath") + if not filepath: + raise DataSetError( + "Schema load argument does not specify a 'filepath' attribute. Please" + "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." + ) + + credentials = deepcopy(schema.get("credentials")) or {} + protocol, schema_path = get_protocol_and_path(filepath) + file_system = fsspec.filesystem(protocol, **credentials) + pure_posix_path = PurePosixPath(schema_path) + load_path = get_filepath_str(pure_posix_path, protocol) + + # Open schema file + with file_system.open(load_path, encoding='utf-8') as fs_file: + try: + return StructType.fromJson(json.loads(fs_file.read())) + except Exception as exc: + raise DataSetError( + f"Contents of 'schema.filepath' ({schema_path}) are invalid. Please" + f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." + ) from exc + def _describe(self) -> Dict[str, Any]: """Returns a dict that describes attributes of the dataset.""" return { @@ -116,16 +151,23 @@ def _get_spark(): def _load(self) -> DataFrame: """Loads data from filepath. 
- If the connector type is kafka then no file_path is required + If the connector type is kafka then no file_path is required, schema needs to be seperated from load_args Returns: Data from filepath as pyspark dataframe. """ - input_constructor = ( - self._get_spark() - .readStream.format(self._file_format) - .options(**self._load_args) - ) + if self._schema: + input_constructor = ( + self._get_spark() + .readStream.schema(self._schema).format(self._file_format) + .options(**self._load_args) + ) + else: + input_constructor = ( + self._get_spark() + .readStream.format(self._file_format) + .options(**self._load_args) + ) return ( input_constructor.load() if self._file_format @@ -155,14 +197,22 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - def _exists(self) -> bool: - load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + def _exists(self, schema_path:str) -> bool: + """Check the existence of pyspark dataframe. + Args: + schema_path: schema of saved streaming dataframe + """ + load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + with open(schema_path, encoding='utf-8') as f: + schema = StructType.fromJson(json.loads(f.read())) try: - self._get_spark().read.load(path=load_path, format="delta") + self._get_spark().readStream.schema(schema).load(load_path, self._file_format) except AnalysisException as exception: - if "is not a Delta table" in exception.desc: + if ( + exception.desc.startswith("Path does not exist:") + or "is not a Streaming data" in exception.desc + ): return False raise - return True \ No newline at end of file diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 47a427742..2d936b1ce 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,12 +1,27 @@ +import json import pytest import time from pyspark.sql import SparkSession -from kedro_datasets.spark import SparkStreamingDataSet,SparkDataSet +from kedro_datasets.pandas import ParquetDataSet +from kedro.io.core import DataSetError +from kedro_datasets.spark.spark_dataset import SparkDataSet +from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet from pyspark.sql.types import IntegerType, StringType, StructField, StructType + +def sample_schema(schema_path): + with open(schema_path, encoding='utf-8') as f: + try: + return StructType.fromJson(json.loads(f.read())) + except Exception as exc: + raise DataSetError( + f"Contents of 'schema.filepath' ({schema_path}) are invalid. " + f"Schema is required for streaming data load, Please provide a valid schema_path." 
+ ) from exc + @pytest.fixture -def sample_spark_streaming_df_one(): +def sample_spark_streaming_df(tmp_path): schema = StructType( [ StructField("sku", StringType(), True), @@ -14,34 +29,41 @@ def sample_spark_streaming_df_one(): ] ) data = [("0001", 2), ("0001", 7), ("0002", 4)] - - return SparkSession.builder.getOrCreate() \ - .createDataFrame(data, schema) + schema_path = (tmp_path / "test.json").as_posix() + with open(schema_path, "w") as f: + json.dump(schema.jsonValue(), f) + return SparkSession.builder.getOrCreate().createDataFrame(data, schema) class TestStreamingDataSet: - def test_load(self,tmp_path, sample_spark_streaming_df_one): + def test_load(self, tmp_path, sample_spark_streaming_df): filepath = (tmp_path / "test_streams").as_posix() - spark_json_ds = SparkDataSet(filepath=filepath, file_format="json",save_args=["mode","overwrite"]) - spark_json_ds.save(sample_spark_streaming_df_one) - loaded_with_spark = spark_json_ds.load() + schema_path = (tmp_path / "test.json").as_posix() - stream_df = SparkStreamingDataSet(filepath=filepath, file_format="json")._load() - assert stream_df.isStreaming + spark_json_ds = SparkDataSet( + filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) - stream_query = stream_df.writeStream.format("memory").queryName("test").start() - assert stream_query.isActive - time.sleep(3) - stream_query.stop() - loaded_memory_stream = SparkSession.builder.getOrCreate().sql("select * from test") + streaming_ds = SparkStreamingDataSet(filepath=filepath, file_format="json", + load_args={"schema": {"filepath": schema_path}}).load() + assert streaming_ds.isStreaming + schema = sample_schema(schema_path) + assert streaming_ds.schema == schema - assert loaded_memory_stream.exceptAll(loaded_with_spark).count()==0 + def test_save(self, tmp_path, sample_spark_streaming_df): + filepath = (tmp_path / "test_streams_input").as_posix() + schema_path = (tmp_path / "test.json").as_posix() + checkpoint_path = (tmp_path / "checkpoint").as_posix() + spark_json_ds = SparkDataSet( + filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) - def test_save(self, tmp_path, sample_spark_df): - filepath = (tmp_path / "test_streams").as_posix() - checkpoint_path = (tmp_path / "checkpoint").as_posix() - streaming_ds = SparkStreamingDataSet(filepath=filepath, save_args=["checkpointLocation",checkpoint_path]) - assert not streaming_ds.exists() + streaming_ds = SparkStreamingDataSet( + filepath=filepath, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + ) + assert streaming_ds._exists(schema_path) From 1433808e2d1940f4be6287f67c73abf2a60c76d0 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:41:59 +0100 Subject: [PATCH 36/96] update test_save Signed-off-by: Tingting_Wan --- .../tests/spark/test_spark_streaming_dataset.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 2d936b1ce..fa3b0fec8 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -52,18 +52,26 @@ def test_load(self, tmp_path, sample_spark_streaming_df): assert streaming_ds.schema == schema def test_save(self, tmp_path, sample_spark_streaming_df): - filepath = (tmp_path / 
"test_streams_input").as_posix() + filepath_json = (tmp_path / "test_streams").as_posix() + filepath_output = (tmp_path / "test_streams_output").as_posix() schema_path = (tmp_path / "test.json").as_posix() checkpoint_path = (tmp_path / "checkpoint").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath_json, file_format="json", save_args=[{"mode","overwrite"}] ) spark_json_ds.save(sample_spark_streaming_df) + loaded_with_streaming = SparkStreamingDataSet(filepath=filepath_json, file_format="json", + load_args={"schema": {"filepath": schema_path}}).load() + streaming_ds = SparkStreamingDataSet( - filepath=filepath, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + filepath=filepath_output, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} ) + assert not streaming_ds._exists(schema_path) + + streaming_ds.save(loaded_with_streaming) assert streaming_ds._exists(schema_path) + From c7778b57932f47db2335acb99c3bd0cbad6655b8 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 25 Apr 2023 10:45:45 +0100 Subject: [PATCH 37/96] formatting Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 23 ++++++++---- .../spark/test_spark_streaming_dataset.py | 36 ++++++++++--------- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index a508a3903..4cb19e6e5 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -7,7 +7,12 @@ import yaml import fsspec -from kedro.io.core import AbstractDataSet,DataSetError, get_filepath_str, get_protocol_and_path +from kedro.io.core import ( + AbstractDataSet, + DataSetError, + get_filepath_str, + get_protocol_and_path, +) from pyspark import SparkConf from pyspark.sql.utils import AnalysisException from pyspark.sql import SparkSession, DataFrame @@ -117,7 +122,7 @@ def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: load_path = get_filepath_str(pure_posix_path, protocol) # Open schema file - with file_system.open(load_path, encoding='utf-8') as fs_file: + with file_system.open(load_path, encoding="utf-8") as fs_file: try: return StructType.fromJson(json.loads(fs_file.read())) except Exception as exc: @@ -159,7 +164,8 @@ def _load(self) -> DataFrame: if self._schema: input_constructor = ( self._get_spark() - .readStream.schema(self._schema).format(self._file_format) + .readStream.schema(self._schema) + .format(self._file_format) .options(**self._load_args) ) else: @@ -197,17 +203,20 @@ def _save(self, data: DataFrame) -> None: .options(**self._save_args) .start() ) - def _exists(self, schema_path:str) -> bool: + + def _exists(self, schema_path: str) -> bool: """Check the existence of pyspark dataframe. 
Args: schema_path: schema of saved streaming dataframe """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) - with open(schema_path, encoding='utf-8') as f: + with open(schema_path, encoding="utf-8") as f: schema = StructType.fromJson(json.loads(f.read())) try: - self._get_spark().readStream.schema(schema).load(load_path, self._file_format) + self._get_spark().readStream.schema(schema).load( + load_path, self._file_format + ) except AnalysisException as exception: if ( exception.desc.startswith("Path does not exist:") @@ -215,4 +224,4 @@ def _exists(self, schema_path:str) -> bool: ): return False raise - return True \ No newline at end of file + return True diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index fa3b0fec8..f2fd3bb3d 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,17 +1,14 @@ import json import pytest -import time from pyspark.sql import SparkSession -from kedro_datasets.pandas import ParquetDataSet +from pyspark.sql.types import IntegerType, StringType, StructField, StructType from kedro.io.core import DataSetError from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet -from pyspark.sql.types import IntegerType, StringType, StructField, StructType - def sample_schema(schema_path): - with open(schema_path, encoding='utf-8') as f: + with open(schema_path, encoding="utf-8") as f: try: return StructType.fromJson(json.loads(f.read())) except Exception as exc: @@ -20,6 +17,7 @@ def sample_schema(schema_path): f"Schema is required for streaming data load, Please provide a valid schema_path." 
) from exc + @pytest.fixture def sample_spark_streaming_df(tmp_path): schema = StructType( @@ -41,12 +39,15 @@ def test_load(self, tmp_path, sample_spark_streaming_df): schema_path = (tmp_path / "test.json").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath, file_format="json", save_args=[{"mode", "overwrite"}] ) spark_json_ds.save(sample_spark_streaming_df) - streaming_ds = SparkStreamingDataSet(filepath=filepath, file_format="json", - load_args={"schema": {"filepath": schema_path}}).load() + streaming_ds = SparkStreamingDataSet( + filepath=filepath, + file_format="json", + load_args={"schema": {"filepath": schema_path}}, + ).load() assert streaming_ds.isStreaming schema = sample_schema(schema_path) assert streaming_ds.schema == schema @@ -58,20 +59,23 @@ def test_save(self, tmp_path, sample_spark_streaming_df): checkpoint_path = (tmp_path / "checkpoint").as_posix() spark_json_ds = SparkDataSet( - filepath=filepath_json, file_format="json", save_args=[{"mode","overwrite"}] + filepath=filepath_json, + file_format="json", + save_args=[{"mode", "overwrite"}], ) spark_json_ds.save(sample_spark_streaming_df) - loaded_with_streaming = SparkStreamingDataSet(filepath=filepath_json, file_format="json", - load_args={"schema": {"filepath": schema_path}}).load() - + loaded_with_streaming = SparkStreamingDataSet( + filepath=filepath_json, + file_format="json", + load_args={"schema": {"filepath": schema_path}}, + ).load() streaming_ds = SparkStreamingDataSet( - filepath=filepath_output, file_format="json",save_args={"checkpoint": checkpoint_path, "output_mode":"append"} + filepath=filepath_output, + file_format="json", + save_args={"checkpoint": checkpoint_path, "output_mode": "append"}, ) assert not streaming_ds._exists(schema_path) streaming_ds.save(loaded_with_streaming) assert streaming_ds._exists(schema_path) - - - From 7341429eaf2ad4af8acf707bef3d96a3e06fea3d Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 17:29:02 +0100 Subject: [PATCH 38/96] formatting Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/__init__.py | 8 +++++++- .../kedro_datasets/spark/deltatable_dataset.py | 3 +-- .../kedro_datasets/spark/spark_streaming_dataset.py | 10 +++++----- .../kedro_datasets/tracking/json_dataset.py | 1 - .../kedro_datasets/tracking/metrics_dataset.py | 1 - kedro-datasets/setup.py | 13 ++++++++----- kedro-datasets/tests/api/test_api_dataset.py | 3 +-- .../bioinformatics/test_biosequence_dataset.py | 3 +-- kedro-datasets/tests/dask/test_parquet_dataset.py | 3 +-- kedro-datasets/tests/email/test_message_dataset.py | 3 +-- .../tests/geojson/test_geojson_dataset.py | 3 +-- .../tests/holoviews/test_holoviews_writer.py | 3 +-- kedro-datasets/tests/json/test_json_dataset.py | 3 +-- .../tests/libsvm/test_svmlight_dataset.py | 3 +-- .../tests/matplotlib/test_matplotlib_writer.py | 3 +-- kedro-datasets/tests/networkx/test_gml_dataset.py | 3 +-- .../tests/networkx/test_graphml_dataset.py | 3 +-- kedro-datasets/tests/networkx/test_json_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_csv_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_excel_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_feather_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_gbq_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_generic_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_hdf_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_json_dataset.py | 3 +-- 
kedro-datasets/tests/pandas/test_parquet_dataset.py | 3 +-- kedro-datasets/tests/pandas/test_sql_dataset.py | 1 - kedro-datasets/tests/pandas/test_xml_dataset.py | 3 +-- kedro-datasets/tests/pickle/test_pickle_dataset.py | 3 +-- kedro-datasets/tests/pillow/test_image_dataset.py | 3 +-- kedro-datasets/tests/plotly/test_json_dataset.py | 3 +-- kedro-datasets/tests/plotly/test_plotly_dataset.py | 3 +-- kedro-datasets/tests/polars/test_csv_dataset.py | 3 +-- kedro-datasets/tests/redis/test_redis_dataset.py | 3 +-- .../tests/snowflake/test_snowpark_dataset.py | 1 - .../tests/spark/test_deltatable_dataset.py | 3 +-- kedro-datasets/tests/spark/test_spark_dataset.py | 9 ++++----- .../tests/spark/test_spark_hive_dataset.py | 3 +-- .../tests/spark/test_spark_jdbc_dataset.py | 1 - .../tests/spark/test_spark_streaming_dataset.py | 5 +++-- kedro-datasets/tests/text/test_text_dataset.py | 3 +-- kedro-datasets/tests/tracking/test_json_dataset.py | 3 +-- .../tests/tracking/test_metrics_dataset.py | 3 +-- kedro-datasets/tests/video/conftest.py | 3 +-- kedro-datasets/tests/video/test_video_dataset.py | 5 ++--- kedro-datasets/tests/video/test_video_objects.py | 3 +-- kedro-datasets/tests/yaml/test_yaml_dataset.py | 3 +-- kedro-docker/features/steps/cli_steps.py | 1 - kedro-docker/kedro_docker/plugin.py | 4 ++-- kedro-docker/tests/test_helpers.py | 1 - kedro-telemetry/kedro_telemetry/plugin.py | 1 - kedro-telemetry/tests/test_masking.py | 1 - kedro-telemetry/tests/test_plugin.py | 3 +-- tools/circleci/circleci_release.py | 1 - 54 files changed, 68 insertions(+), 107 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/__init__.py b/kedro-datasets/kedro_datasets/spark/__init__.py index 0c46a7fc3..bd649f5c7 100644 --- a/kedro-datasets/kedro_datasets/spark/__init__.py +++ b/kedro-datasets/kedro_datasets/spark/__init__.py @@ -1,6 +1,12 @@ """Provides I/O modules for Apache Spark.""" -__all__ = ["SparkDataSet", "SparkHiveDataSet", "SparkJDBCDataSet", "DeltaTableDataSet","SparkStreamingDataSet"] +__all__ = [ + "SparkDataSet", + "SparkHiveDataSet", + "SparkJDBCDataSet", + "DeltaTableDataSet", + "SparkStreamingDataSet", +] from contextlib import suppress diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index 34ee6f6a5..9454a47f7 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -6,11 +6,10 @@ from delta.tables import DeltaTable from kedro.io.core import AbstractDataSet, DataSetError +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark.sql import SparkSession from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix - class DeltaTableDataSet(AbstractDataSet[None, DeltaTable]): """``DeltaTableDataSet`` loads data into DeltaTable objects. 
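Everything in this commit is mechanical: import re-ordering plus black-style layout changes, with no behaviour change. For readers skimming the diffs, the ordering applied here sorts first-party ``kedro_datasets`` imports together with the third-party packages, roughly as in the sketch below (module names are illustrative, taken from the affected test files); the later ``lint`` commit in this series moves the ``kedro_datasets`` imports back into their own block after the third-party ones.

.. code-block:: python

    # Ordering applied by this "formatting" commit: first-party imports are
    # sorted alphabetically alongside third-party packages (names illustrative).
    import pytest
    from kedro.io import DataSetError
    from kedro_datasets.json import JSONDataSet
    from s3fs.core import S3FileSystem

    # Ordering restored by the later "lint" commit: third-party imports first,
    # then a blank line, then a separate first-party block.
    # import pytest
    # from kedro.io import DataSetError
    # from s3fs.core import S3FileSystem
    #
    # from kedro_datasets.json import JSONDataSet
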
diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 4cb19e6e5..203539a11 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,24 +1,24 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" import json import os -from typing import Any, Dict from copy import deepcopy from pathlib import PurePosixPath -import yaml +from typing import Any, Dict import fsspec +import yaml from kedro.io.core import ( AbstractDataSet, DataSetError, get_filepath_str, get_protocol_and_path, ) +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark import SparkConf -from pyspark.sql.utils import AnalysisException -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType +from pyspark.sql.utils import AnalysisException from yaml.loader import SafeLoader -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix class SparkStreamingDataSet(AbstractDataSet): diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 4235df999..994236d3d 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -5,7 +5,6 @@ from typing import NoReturn from kedro.io.core import DataSetError - from kedro_datasets.json import JSONDataSet as JDS diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index 7c7546a85..2e4e2d970 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -7,7 +7,6 @@ from typing import Dict, NoReturn from kedro.io.core import DataSetError, get_filepath_str - from kedro_datasets.json import JSONDataSet diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index f2f4921a5..be99f9912 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -46,10 +46,15 @@ def _collect_requirements(requires): "plotly.PlotlyDataSet": [PANDAS, "plotly>=4.8.0, <6.0"], "plotly.JSONDataSet": ["plotly>=4.8.0, <6.0"], } -polars_require = {"polars.CSVDataSet": [POLARS],} +polars_require = { + "polars.CSVDataSet": [POLARS], +} redis_require = {"redis.PickleDataSet": ["redis~=4.1"]} snowflake_require = { - "snowflake.SnowparkTableDataSet": ["snowflake-snowpark-python~=1.0.0", "pyarrow~=8.0"] + "snowflake.SnowparkTableDataSet": [ + "snowflake-snowpark-python~=1.0.0", + "pyarrow~=8.0", + ] } spark_require = { "spark.SparkDataSet": [SPARK, HDFS, S3FS], @@ -67,9 +72,7 @@ def _collect_requirements(requires): "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", ] } -video_require = { - "video.VideoDataSet": ["opencv-python~=4.5.5.64"] -} +video_require = {"video.VideoDataSet": ["opencv-python~=4.5.5.64"]} yaml_require = {"yaml.YAMLDataSet": [PANDAS, "PyYAML>=4.2, <7.0"]} extras_require = { diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index 848020041..51279c71c 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -5,9 +5,8 @@ import pytest import requests from kedro.io.core import DataSetError -from requests.auth import 
HTTPBasicAuth - from kedro_datasets.api import APIDataSet +from requests.auth import HTTPBasicAuth POSSIBLE_METHODS = ["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"] diff --git a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py index 24666baaf..42b3e252f 100644 --- a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py +++ b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.biosequence import BioSequenceDataSet +from s3fs.core import S3FileSystem LOAD_ARGS = {"format": "fasta"} SAVE_ARGS = {"format": "fasta"} diff --git a/kedro-datasets/tests/dask/test_parquet_dataset.py b/kedro-datasets/tests/dask/test_parquet_dataset.py index 8475dbf47..3824d6c0f 100644 --- a/kedro-datasets/tests/dask/test_parquet_dataset.py +++ b/kedro-datasets/tests/dask/test_parquet_dataset.py @@ -5,12 +5,11 @@ import pyarrow.parquet as pq import pytest from kedro.io import DataSetError +from kedro_datasets.dask import ParquetDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs import S3FileSystem -from kedro_datasets.dask import ParquetDataSet - FILE_NAME = "test.parquet" BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} diff --git a/kedro-datasets/tests/email/test_message_dataset.py b/kedro-datasets/tests/email/test_message_dataset.py index 100daba52..6f97b6c89 100644 --- a/kedro-datasets/tests/email/test_message_dataset.py +++ b/kedro-datasets/tests/email/test_message_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.email import EmailMessageDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/geojson/test_geojson_dataset.py b/kedro-datasets/tests/geojson/test_geojson_dataset.py index b5f3ec4cb..cd6c07c7c 100644 --- a/kedro-datasets/tests/geojson/test_geojson_dataset.py +++ b/kedro-datasets/tests/geojson/test_geojson_dataset.py @@ -7,12 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.geopandas import GeoJSONDataSet from pandas.testing import assert_frame_equal from s3fs import S3FileSystem from shapely.geometry import Point -from kedro_datasets.geopandas import GeoJSONDataSet - @pytest.fixture(params=[None]) def load_version(request): diff --git a/kedro-datasets/tests/holoviews/test_holoviews_writer.py b/kedro-datasets/tests/holoviews/test_holoviews_writer.py index f4f91383e..53ca795f2 100644 --- a/kedro-datasets/tests/holoviews/test_holoviews_writer.py +++ b/kedro-datasets/tests/holoviews/test_holoviews_writer.py @@ -9,9 +9,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.holoviews import HoloviewsWriter +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/json/test_json_dataset.py b/kedro-datasets/tests/json/test_json_dataset.py index 621e51fcd..dafdd8e3e 100644 --- a/kedro-datasets/tests/json/test_json_dataset.py +++ 
b/kedro-datasets/tests/json/test_json_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.json import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py index 8fff3edd2..9fcf09c0c 100644 --- a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py +++ b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.svmlight import SVMLightDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py index 4086e127e..ed4dec348 100644 --- a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py +++ b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py @@ -6,11 +6,10 @@ import matplotlib.pyplot as plt import pytest from kedro.io import DataSetError, Version +from kedro_datasets.matplotlib import MatplotlibWriter from moto import mock_s3 from s3fs import S3FileSystem -from kedro_datasets.matplotlib import MatplotlibWriter - BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "testing", "secret": "testing"} KEY_PATH = "matplotlib" diff --git a/kedro-datasets/tests/networkx/test_gml_dataset.py b/kedro-datasets/tests/networkx/test_gml_dataset.py index a3a89eca7..dd589019d 100644 --- a/kedro-datasets/tests/networkx/test_gml_dataset.py +++ b/kedro-datasets/tests/networkx/test_gml_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import GMLDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/networkx/test_graphml_dataset.py b/kedro-datasets/tests/networkx/test_graphml_dataset.py index 4e0dcf40d..9ff22883e 100644 --- a/kedro-datasets/tests/networkx/test_graphml_dataset.py +++ b/kedro-datasets/tests/networkx/test_graphml_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import GraphMLDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/networkx/test_json_dataset.py b/kedro-datasets/tests/networkx/test_json_dataset.py index 4d6e582a8..ed437f69a 100644 --- a/kedro-datasets/tests/networkx/test_json_dataset.py +++ b/kedro-datasets/tests/networkx/test_json_dataset.py @@ -7,9 +7,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.networkx import JSONDataSet +from s3fs.core import S3FileSystem ATTRS = { "source": "from", diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 5cc1ee36b..53a1e7c52 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -12,12 +12,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError 
from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.pandas import CSVDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import CSVDataSet - BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py index 1080cc9b6..bae8c5147 100644 --- a/kedro-datasets/tests/pandas/test_excel_dataset.py +++ b/kedro-datasets/tests/pandas/test_excel_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import ExcelDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import ExcelDataSet - @pytest.fixture def filepath_excel(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_feather_dataset.py b/kedro-datasets/tests/pandas/test_feather_dataset.py index 80c1ce678..ec995d657 100644 --- a/kedro-datasets/tests/pandas/test_feather_dataset.py +++ b/kedro-datasets/tests/pandas/test_feather_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import FeatherDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import FeatherDataSet - @pytest.fixture def filepath_feather(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_gbq_dataset.py b/kedro-datasets/tests/pandas/test_gbq_dataset.py index e239dbaba..d970db36e 100644 --- a/kedro-datasets/tests/pandas/test_gbq_dataset.py +++ b/kedro-datasets/tests/pandas/test_gbq_dataset.py @@ -4,9 +4,8 @@ import pytest from google.cloud.exceptions import NotFound from kedro.io.core import DataSetError -from pandas.testing import assert_frame_equal - from kedro_datasets.pandas import GBQQueryDataSet, GBQTableDataSet +from pandas.testing import assert_frame_equal DATASET = "dataset" TABLE_NAME = "table_name" diff --git a/kedro-datasets/tests/pandas/test_generic_dataset.py b/kedro-datasets/tests/pandas/test_generic_dataset.py index 6f40bb0d4..2526c1ed6 100644 --- a/kedro-datasets/tests/pandas/test_generic_dataset.py +++ b/kedro-datasets/tests/pandas/test_generic_dataset.py @@ -9,11 +9,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER, generate_timestamp +from kedro_datasets.pandas import GenericDataSet from pandas._testing import assert_frame_equal from s3fs import S3FileSystem -from kedro_datasets.pandas import GenericDataSet - @pytest.fixture def filepath_sas(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_hdf_dataset.py b/kedro-datasets/tests/pandas/test_hdf_dataset.py index 563ba63d9..c59e7a104 100644 --- a/kedro-datasets/tests/pandas/test_hdf_dataset.py +++ b/kedro-datasets/tests/pandas/test_hdf_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import HDFDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import HDFDataSet - HDF_KEY = "data" diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index df2e856d5..7da50165e 100644 --- 
a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import JSONDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import JSONDataSet - @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_parquet_dataset.py b/kedro-datasets/tests/pandas/test_parquet_dataset.py index 2d7ce2996..cc62ed203 100644 --- a/kedro-datasets/tests/pandas/test_parquet_dataset.py +++ b/kedro-datasets/tests/pandas/test_parquet_dataset.py @@ -7,12 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import ParquetDataSet from pandas.testing import assert_frame_equal from pyarrow.fs import FSSpecHandler, PyFileSystem from s3fs.core import S3FileSystem -from kedro_datasets.pandas import ParquetDataSet - FILENAME = "test.parquet" diff --git a/kedro-datasets/tests/pandas/test_sql_dataset.py b/kedro-datasets/tests/pandas/test_sql_dataset.py index 308582859..b810748c2 100644 --- a/kedro-datasets/tests/pandas/test_sql_dataset.py +++ b/kedro-datasets/tests/pandas/test_sql_dataset.py @@ -6,7 +6,6 @@ import pytest import sqlalchemy from kedro.io import DataSetError - from kedro_datasets.pandas import SQLQueryDataSet, SQLTableDataSet TABLE_NAME = "table_a" diff --git a/kedro-datasets/tests/pandas/test_xml_dataset.py b/kedro-datasets/tests/pandas/test_xml_dataset.py index bd62ea586..65be88174 100644 --- a/kedro-datasets/tests/pandas/test_xml_dataset.py +++ b/kedro-datasets/tests/pandas/test_xml_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pandas import XMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pandas import XMLDataSet - @pytest.fixture def filepath_xml(tmp_path): diff --git a/kedro-datasets/tests/pickle/test_pickle_dataset.py b/kedro-datasets/tests/pickle/test_pickle_dataset.py index fb95681a3..2846201cf 100644 --- a/kedro-datasets/tests/pickle/test_pickle_dataset.py +++ b/kedro-datasets/tests/pickle/test_pickle_dataset.py @@ -8,11 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.pickle import PickleDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.pickle import PickleDataSet - @pytest.fixture def filepath_pickle(tmp_path): diff --git a/kedro-datasets/tests/pillow/test_image_dataset.py b/kedro-datasets/tests/pillow/test_image_dataset.py index ea500b20d..ed27e3cb9 100644 --- a/kedro-datasets/tests/pillow/test_image_dataset.py +++ b/kedro-datasets/tests/pillow/test_image_dataset.py @@ -6,11 +6,10 @@ from fsspec.implementations.local import LocalFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.pillow import ImageDataSet from PIL import Image, ImageChops from s3fs.core import S3FileSystem -from kedro_datasets.pillow import ImageDataSet - @pytest.fixture def filepath_png(tmp_path): diff --git a/kedro-datasets/tests/plotly/test_json_dataset.py 
b/kedro-datasets/tests/plotly/test_json_dataset.py index ab6e17d9c..0115a72dd 100644 --- a/kedro-datasets/tests/plotly/test_json_dataset.py +++ b/kedro-datasets/tests/plotly/test_json_dataset.py @@ -8,9 +8,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from s3fs.core import S3FileSystem - from kedro_datasets.plotly import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/plotly/test_plotly_dataset.py b/kedro-datasets/tests/plotly/test_plotly_dataset.py index a422060e8..9b33492bf 100644 --- a/kedro-datasets/tests/plotly/test_plotly_dataset.py +++ b/kedro-datasets/tests/plotly/test_plotly_dataset.py @@ -8,12 +8,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER +from kedro_datasets.plotly import PlotlyDataSet from plotly import graph_objects from plotly.graph_objs import Scatter from s3fs.core import S3FileSystem -from kedro_datasets.plotly import PlotlyDataSet - @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index d79183539..4c0807d91 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -12,12 +12,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.polars import CSVDataSet from moto import mock_s3 from polars.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.polars import CSVDataSet - BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/redis/test_redis_dataset.py b/kedro-datasets/tests/redis/test_redis_dataset.py index eaa8abbd2..ddda22c17 100644 --- a/kedro-datasets/tests/redis/test_redis_dataset.py +++ b/kedro-datasets/tests/redis/test_redis_dataset.py @@ -8,9 +8,8 @@ import pytest import redis from kedro.io import DataSetError -from pandas.testing import assert_frame_equal - from kedro_datasets.redis import PickleDataSet +from pandas.testing import assert_frame_equal @pytest.fixture(params=["pickle"]) diff --git a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py index 2133953b5..d73731df2 100644 --- a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py +++ b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py @@ -6,7 +6,6 @@ try: import snowflake.snowpark as sp - from kedro_datasets.snowflake import SnowparkTableDataSet as spds except ImportError: pass # this is only for test discovery to succeed on Python <> 3.8 diff --git a/kedro-datasets/tests/spark/test_deltatable_dataset.py b/kedro-datasets/tests/spark/test_deltatable_dataset.py index 5cbbe62b7..430c78ea2 100644 --- a/kedro-datasets/tests/spark/test_deltatable_dataset.py +++ b/kedro-datasets/tests/spark/test_deltatable_dataset.py @@ -4,12 +4,11 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner +from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet - @pytest.fixture def sample_spark_df(): diff 
--git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index 9452b007d..9a3e58035 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -12,6 +12,10 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner, SequentialRunner +from kedro_datasets.pandas import CSVDataSet, ParquetDataSet +from kedro_datasets.pickle import PickleDataSet +from kedro_datasets.spark import SparkDataSet +from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils from moto import mock_s3 from pyspark.sql import SparkSession from pyspark.sql.functions import col @@ -24,11 +28,6 @@ ) from pyspark.sql.utils import AnalysisException -from kedro_datasets.pandas import CSVDataSet, ParquetDataSet -from kedro_datasets.pickle import PickleDataSet -from kedro_datasets.spark import SparkDataSet -from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils - FOLDER_NAME = "fake_folder" FILENAME = "test.parquet" BUCKET_NAME = "test_bucket" diff --git a/kedro-datasets/tests/spark/test_spark_hive_dataset.py b/kedro-datasets/tests/spark/test_spark_hive_dataset.py index e0b8fc333..88c18aee6 100644 --- a/kedro-datasets/tests/spark/test_spark_hive_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_hive_dataset.py @@ -5,13 +5,12 @@ import pytest from kedro.io import DataSetError +from kedro_datasets.spark import SparkHiveDataSet from psutil import Popen from pyspark import SparkContext from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType -from kedro_datasets.spark import SparkHiveDataSet - TESTSPARKDIR = "test_spark_dir" diff --git a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py index 0f3d0e66b..73e091ef9 100644 --- a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py @@ -2,7 +2,6 @@ import pytest from kedro.io import DataSetError - from kedro_datasets.spark import SparkJDBCDataSet diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index f2fd3bb3d..fe59c5810 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,10 +1,11 @@ import json + import pytest -from pyspark.sql import SparkSession -from pyspark.sql.types import IntegerType, StringType, StructField, StructType from kedro.io.core import DataSetError from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet +from pyspark.sql import SparkSession +from pyspark.sql.types import IntegerType, StringType, StructField, StructType def sample_schema(schema_path): diff --git a/kedro-datasets/tests/text/test_text_dataset.py b/kedro-datasets/tests/text/test_text_dataset.py index 733cc6c1f..a4bee6896 100644 --- a/kedro-datasets/tests/text/test_text_dataset.py +++ b/kedro-datasets/tests/text/test_text_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.text import TextDataSet +from s3fs.core import S3FileSystem STRING = "Write to text file." 
diff --git a/kedro-datasets/tests/tracking/test_json_dataset.py b/kedro-datasets/tests/tracking/test_json_dataset.py index 62172b1a4..2529868c4 100644 --- a/kedro-datasets/tests/tracking/test_json_dataset.py +++ b/kedro-datasets/tests/tracking/test_json_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.tracking import JSONDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/tracking/test_metrics_dataset.py b/kedro-datasets/tests/tracking/test_metrics_dataset.py index 2c1157de9..ad9f4a1cb 100644 --- a/kedro-datasets/tests/tracking/test_metrics_dataset.py +++ b/kedro-datasets/tests/tracking/test_metrics_dataset.py @@ -6,9 +6,8 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from s3fs.core import S3FileSystem - from kedro_datasets.tracking import MetricsDataSet +from s3fs.core import S3FileSystem @pytest.fixture diff --git a/kedro-datasets/tests/video/conftest.py b/kedro-datasets/tests/video/conftest.py index 7a0a4c87b..0dd5576dc 100644 --- a/kedro-datasets/tests/video/conftest.py +++ b/kedro-datasets/tests/video/conftest.py @@ -1,11 +1,10 @@ from pathlib import Path import pytest +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from PIL import Image from utils import TEST_FPS, TEST_HEIGHT, TEST_WIDTH -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo - @pytest.fixture(scope="module") def red_frame(): diff --git a/kedro-datasets/tests/video/test_video_dataset.py b/kedro-datasets/tests/video/test_video_dataset.py index 1ac3d1ce4..b4428c4df 100644 --- a/kedro-datasets/tests/video/test_video_dataset.py +++ b/kedro-datasets/tests/video/test_video_dataset.py @@ -1,11 +1,10 @@ import boto3 import pytest from kedro.io import DataSetError -from moto import mock_s3 -from utils import TEST_FPS, assert_videos_equal - from kedro_datasets.video import VideoDataSet from kedro_datasets.video.video_dataset import FileVideo, SequenceVideo +from moto import mock_s3 +from utils import TEST_FPS, assert_videos_equal S3_BUCKET_NAME = "test_bucket" S3_KEY_PATH = "video" diff --git a/kedro-datasets/tests/video/test_video_objects.py b/kedro-datasets/tests/video/test_video_objects.py index 1cb7cca75..3adb701d2 100644 --- a/kedro-datasets/tests/video/test_video_objects.py +++ b/kedro-datasets/tests/video/test_video_objects.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from utils import ( DEFAULT_FOURCC, MJPEG_FOURCC, @@ -21,8 +22,6 @@ assert_images_equal, ) -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo - class TestSequenceVideo: def test_sequence_video_indexing_first(self, color_video, red_frame): diff --git a/kedro-datasets/tests/yaml/test_yaml_dataset.py b/kedro-datasets/tests/yaml/test_yaml_dataset.py index 653606c17..2cadeee7d 100644 --- a/kedro-datasets/tests/yaml/test_yaml_dataset.py +++ b/kedro-datasets/tests/yaml/test_yaml_dataset.py @@ -7,11 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.yaml import YAMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem -from kedro_datasets.yaml import 
YAMLDataSet - @pytest.fixture def filepath_yaml(tmp_path): diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 0306c1e2f..2c680fd70 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -8,7 +8,6 @@ import behave import yaml from behave import given, then, when - from features.steps.sh_run import ChildTerminatingPopen, run from features.steps.util import ( TimeoutException, diff --git a/kedro-docker/kedro_docker/plugin.py b/kedro-docker/kedro_docker/plugin.py index 27af7db96..cc8dda1c4 100644 --- a/kedro-docker/kedro_docker/plugin.py +++ b/kedro-docker/kedro_docker/plugin.py @@ -125,9 +125,9 @@ def docker_init(spark): if KEDRO_VERSION.match(">=0.17.0"): verbose = KedroCliError.VERBOSE_ERROR else: - from kedro.framework.cli.cli import ( # noqa # pylint:disable=import-outside-toplevel, no-name-in-module + from kedro.framework.cli.cli import ( _VERBOSE as verbose, - ) + ) # noqa # pylint:disable=import-outside-toplevel, no-name-in-module docker_file_version = "spark" if spark else "simple" docker_file = f"Dockerfile.{docker_file_version}" diff --git a/kedro-docker/tests/test_helpers.py b/kedro-docker/tests/test_helpers.py index 40b5d9306..f205c9efe 100644 --- a/kedro-docker/tests/test_helpers.py +++ b/kedro-docker/tests/test_helpers.py @@ -3,7 +3,6 @@ import pytest from click import ClickException - from kedro_docker.helpers import ( add_jupyter_args, check_docker_image_exists, diff --git a/kedro-telemetry/kedro_telemetry/plugin.py b/kedro-telemetry/kedro_telemetry/plugin.py index 5eeb4d489..1027d541d 100644 --- a/kedro-telemetry/kedro_telemetry/plugin.py +++ b/kedro-telemetry/kedro_telemetry/plugin.py @@ -22,7 +22,6 @@ from kedro.framework.startup import ProjectMetadata from kedro.io.data_catalog import DataCatalog from kedro.pipeline import Pipeline - from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.masking import _get_cli_structure, _mask_kedro_cli diff --git a/kedro-telemetry/tests/test_masking.py b/kedro-telemetry/tests/test_masking.py index 74773e2f4..1e674096b 100644 --- a/kedro-telemetry/tests/test_masking.py +++ b/kedro-telemetry/tests/test_masking.py @@ -9,7 +9,6 @@ from kedro import __version__ as kedro_version from kedro.framework.cli.cli import KedroCLI, cli from kedro.framework.startup import ProjectMetadata - from kedro_telemetry.masking import ( MASK, _get_cli_structure, diff --git a/kedro-telemetry/tests/test_plugin.py b/kedro-telemetry/tests/test_plugin.py index 222bcc914..9b1a6460b 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -9,8 +9,6 @@ from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline -from pytest import fixture - from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.plugin import ( KedroTelemetryCLIHooks, @@ -18,6 +16,7 @@ _check_for_telemetry_consent, _confirm_consent, ) +from pytest import fixture REPO_NAME = "dummy_project" PACKAGE_NAME = "dummy_package" diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py index dd05d4c5a..e8f5d8449 100755 --- a/tools/circleci/circleci_release.py +++ b/tools/circleci/circleci_release.py @@ -8,7 +8,6 @@ import requests from requests.structures import CaseInsensitiveDict - from utils.check_no_version_pypi import check_no_version_pypi from utils.package_version import get_package_version From 
d8d3bc281b54ec3382d8de954c26025491a7f4a2 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 19:08:07 +0100 Subject: [PATCH 39/96] formatting Signed-off-by: Tingting_Wan --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index fe59c5810..82b90481c 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -2,11 +2,12 @@ import pytest from kedro.io.core import DataSetError -from kedro_datasets.spark.spark_dataset import SparkDataSet -from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from kedro_datasets.spark.spark_dataset import SparkDataSet +from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet + def sample_schema(schema_path): with open(schema_path, encoding="utf-8") as f: @@ -29,7 +30,7 @@ def sample_spark_streaming_df(tmp_path): ) data = [("0001", 2), ("0001", 7), ("0002", 4)] schema_path = (tmp_path / "test.json").as_posix() - with open(schema_path, "w") as f: + with open(schema_path, "w", encoding="utf-8") as f: json.dump(schema.jsonValue(), f) return SparkSession.builder.getOrCreate().createDataFrame(data, schema) From be4a3e5c3698a456f6c11d1b8041ea7ba2340298 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Mon, 1 May 2023 19:08:44 +0100 Subject: [PATCH 40/96] formatting Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 203539a11..79a044c6d 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -13,13 +13,14 @@ get_filepath_str, get_protocol_and_path, ) -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark import SparkConf from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType from pyspark.sql.utils import AnalysisException from yaml.loader import SafeLoader +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix + class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. @@ -47,6 +48,7 @@ class SparkStreamingDataSet(AbstractDataSet): """ + # pylint: disable=too-many-instance-attributes DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] @@ -156,7 +158,8 @@ def _get_spark(): def _load(self) -> DataFrame: """Loads data from filepath. - If the connector type is kafka then no file_path is required, schema needs to be seperated from load_args + If the connector type is kafka then no file_path is required, schema needs to be + seperated from load_args. Returns: Data from filepath as pyspark dataframe. 
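The docstring change above documents an important detail of the load path: file-based streaming sources need an explicit schema, so the schema is supplied through ``load_args`` separately from the other reader options. A minimal usage sketch, mirroring what the tests in this series do (file paths are illustrative):

.. code-block:: python

    # Minimal sketch of a file-based streaming load; paths are illustrative.
    from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet

    streaming_ds = SparkStreamingDataSet(
        filepath="data/01_raw/stream/inventory/",
        file_format="json",
        # The schema is given as a path to a JSON-serialised StructType,
        # kept separate from the remaining reader options.
        load_args={"schema": {"filepath": "data/01_raw/inventory_schema.json"}},
    )
    df = streaming_ds.load()
    assert df.isStreaming
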
@@ -211,8 +214,8 @@ def _exists(self, schema_path: str) -> bool: schema_path: schema of saved streaming dataframe """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) - with open(schema_path, encoding="utf-8") as f: - schema = StructType.fromJson(json.loads(f.read())) + with open(schema_path, encoding="utf-8") as schema_file: + schema = StructType.fromJson(json.loads(schema_file.read())) try: self._get_spark().readStream.schema(schema).load( load_path, self._file_format From e39c6397182d163aa13c6ee46be67679357dfcad Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 11:04:12 +0100 Subject: [PATCH 41/96] lint Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 2 +- .../kedro_datasets/spark/deltatable_dataset.py | 3 ++- .../kedro_datasets/spark/spark_streaming_dataset.py | 9 --------- kedro-datasets/kedro_datasets/tracking/json_dataset.py | 1 + .../kedro_datasets/tracking/metrics_dataset.py | 1 + kedro-datasets/tests/api/test_api_dataset.py | 3 ++- .../tests/bioinformatics/test_biosequence_dataset.py | 3 ++- kedro-datasets/tests/dask/test_parquet_dataset.py | 3 ++- kedro-datasets/tests/email/test_message_dataset.py | 3 ++- kedro-datasets/tests/geojson/test_geojson_dataset.py | 3 ++- kedro-datasets/tests/holoviews/test_holoviews_writer.py | 3 ++- kedro-datasets/tests/json/test_json_dataset.py | 3 ++- kedro-datasets/tests/libsvm/test_svmlight_dataset.py | 3 ++- .../tests/matplotlib/test_matplotlib_writer.py | 3 ++- kedro-datasets/tests/networkx/test_gml_dataset.py | 3 ++- kedro-datasets/tests/networkx/test_graphml_dataset.py | 3 ++- kedro-datasets/tests/networkx/test_json_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_csv_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_excel_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_feather_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_gbq_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_generic_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_hdf_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_json_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_parquet_dataset.py | 3 ++- kedro-datasets/tests/pandas/test_sql_dataset.py | 1 + kedro-datasets/tests/pandas/test_xml_dataset.py | 3 ++- kedro-datasets/tests/pickle/test_pickle_dataset.py | 3 ++- kedro-datasets/tests/pillow/test_image_dataset.py | 3 ++- kedro-datasets/tests/plotly/test_json_dataset.py | 3 ++- kedro-datasets/tests/plotly/test_plotly_dataset.py | 3 ++- kedro-datasets/tests/polars/test_csv_dataset.py | 3 ++- kedro-datasets/tests/redis/test_redis_dataset.py | 3 ++- kedro-datasets/tests/snowflake/test_snowpark_dataset.py | 1 + kedro-datasets/tests/spark/test_deltatable_dataset.py | 3 ++- kedro-datasets/tests/spark/test_spark_dataset.py | 9 +++++---- kedro-datasets/tests/spark/test_spark_hive_dataset.py | 3 ++- kedro-datasets/tests/spark/test_spark_jdbc_dataset.py | 1 + kedro-datasets/tests/text/test_text_dataset.py | 3 ++- kedro-datasets/tests/tracking/test_json_dataset.py | 3 ++- kedro-datasets/tests/tracking/test_metrics_dataset.py | 3 ++- kedro-datasets/tests/video/conftest.py | 3 ++- kedro-datasets/tests/video/test_video_dataset.py | 5 +++-- kedro-datasets/tests/video/test_video_objects.py | 3 ++- kedro-datasets/tests/yaml/test_yaml_dataset.py | 3 ++- 45 files changed, 86 insertions(+), 52 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index f222df00a..910289135 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ 
b/kedro-datasets/kedro_datasets/spark/README.md @@ -3,7 +3,7 @@ ``SparkStreamingDataSet`` loads and saves data to streaming DataFrames. See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. -To work with multiple streaming nodes, 2 hook are required for: +To work with multiple streaming nodes, 2 hook are required for: - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details - Running streaming query without termination unless exception diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index 9454a47f7..34ee6f6a5 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -6,10 +6,11 @@ from delta.tables import DeltaTable from kedro.io.core import AbstractDataSet, DataSetError -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix from pyspark.sql import SparkSession from pyspark.sql.utils import AnalysisException +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix + class DeltaTableDataSet(AbstractDataSet[None, DeltaTable]): """``DeltaTableDataSet`` loads data into DeltaTable objects. diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 79a044c6d..4d7695e4e 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -24,17 +24,14 @@ class SparkStreamingDataSet(AbstractDataSet): """``SparkStreamingDataSet`` loads data into Spark Streaming Dataframe objects. - Example usage for the `YAML API `_: .. code-block:: yaml - raw.new_inventory: type: spark.SparkStreamingDataSet filepath: data/01_raw/stream/inventory/ file_format: json - int.new_inventory: type: spark.SparkStreamingDataSet filepath: data/02_intermediate/inventory/ @@ -45,7 +42,6 @@ class SparkStreamingDataSet(AbstractDataSet): header: True load_args: header: True - """ # pylint: disable=too-many-instance-attributes @@ -60,7 +56,6 @@ def __init__( load_args: Dict[str, Any] = None, ) -> None: """Creates a new instance of SparkStreamingDataSet. - Args: filepath: Filepath in POSIX format to a Spark dataframe. When using Databricks specify ``filepath``s starting with ``/dbfs/``. For message brokers such as @@ -160,7 +155,6 @@ def _load(self) -> DataFrame: """Loads data from filepath. If the connector type is kafka then no file_path is required, schema needs to be seperated from load_args. - Returns: Data from filepath as pyspark dataframe. """ @@ -186,10 +180,8 @@ def _load(self) -> DataFrame: def _save(self, data: DataFrame) -> None: """Saves pyspark dataframe. - Args: data: PySpark streaming dataframe for saving - """ output_constructor = data.writeStream.format(self._file_format) @@ -209,7 +201,6 @@ def _save(self, data: DataFrame) -> None: def _exists(self, schema_path: str) -> bool: """Check the existence of pyspark dataframe. 
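The README hunk earlier in this commit states that running several streaming nodes needs, besides the PySpark integration hook, a hook that keeps the streaming queries running until one of them raises. That hook is not included in this patch series, so the snippet below is only a sketch of the kind of hook being described, written against the standard PySpark ``StreamingQueryManager`` API; the class name is made up for illustration.

.. code-block:: python

    # Hypothetical sketch of a "keep streaming queries alive" hook;
    # not part of this patch series.
    from kedro.framework.hooks import hook_impl
    from pyspark.sql import SparkSession


    class SparkStreamsHook:
        @hook_impl
        def after_pipeline_run(self) -> None:
            """Block until any active streaming query terminates or fails."""
            spark = SparkSession.builder.getOrCreate()
            spark.streams.awaitAnyTermination()
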
- Args: schema_path: schema of saved streaming dataframe """ diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 994236d3d..4235df999 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -5,6 +5,7 @@ from typing import NoReturn from kedro.io.core import DataSetError + from kedro_datasets.json import JSONDataSet as JDS diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index 2e4e2d970..7c7546a85 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -7,6 +7,7 @@ from typing import Dict, NoReturn from kedro.io.core import DataSetError, get_filepath_str + from kedro_datasets.json import JSONDataSet diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index 51279c71c..848020041 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -5,9 +5,10 @@ import pytest import requests from kedro.io.core import DataSetError -from kedro_datasets.api import APIDataSet from requests.auth import HTTPBasicAuth +from kedro_datasets.api import APIDataSet + POSSIBLE_METHODS = ["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"] TEST_URL = "http://example.com/api/test" diff --git a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py index 42b3e252f..24666baaf 100644 --- a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py +++ b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py @@ -8,9 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.biosequence import BioSequenceDataSet from s3fs.core import S3FileSystem +from kedro_datasets.biosequence import BioSequenceDataSet + LOAD_ARGS = {"format": "fasta"} SAVE_ARGS = {"format": "fasta"} diff --git a/kedro-datasets/tests/dask/test_parquet_dataset.py b/kedro-datasets/tests/dask/test_parquet_dataset.py index 3824d6c0f..8475dbf47 100644 --- a/kedro-datasets/tests/dask/test_parquet_dataset.py +++ b/kedro-datasets/tests/dask/test_parquet_dataset.py @@ -5,11 +5,12 @@ import pyarrow.parquet as pq import pytest from kedro.io import DataSetError -from kedro_datasets.dask import ParquetDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs import S3FileSystem +from kedro_datasets.dask import ParquetDataSet + FILE_NAME = "test.parquet" BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} diff --git a/kedro-datasets/tests/email/test_message_dataset.py b/kedro-datasets/tests/email/test_message_dataset.py index 6f97b6c89..100daba52 100644 --- a/kedro-datasets/tests/email/test_message_dataset.py +++ b/kedro-datasets/tests/email/test_message_dataset.py @@ -8,9 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.email import EmailMessageDataSet from s3fs.core import S3FileSystem +from kedro_datasets.email import EmailMessageDataSet + @pytest.fixture def filepath_message(tmp_path): diff --git a/kedro-datasets/tests/geojson/test_geojson_dataset.py 
b/kedro-datasets/tests/geojson/test_geojson_dataset.py index cd6c07c7c..b5f3ec4cb 100644 --- a/kedro-datasets/tests/geojson/test_geojson_dataset.py +++ b/kedro-datasets/tests/geojson/test_geojson_dataset.py @@ -7,11 +7,12 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp -from kedro_datasets.geopandas import GeoJSONDataSet from pandas.testing import assert_frame_equal from s3fs import S3FileSystem from shapely.geometry import Point +from kedro_datasets.geopandas import GeoJSONDataSet + @pytest.fixture(params=[None]) def load_version(request): diff --git a/kedro-datasets/tests/holoviews/test_holoviews_writer.py b/kedro-datasets/tests/holoviews/test_holoviews_writer.py index 53ca795f2..f4f91383e 100644 --- a/kedro-datasets/tests/holoviews/test_holoviews_writer.py +++ b/kedro-datasets/tests/holoviews/test_holoviews_writer.py @@ -9,9 +9,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.holoviews import HoloviewsWriter from s3fs.core import S3FileSystem +from kedro_datasets.holoviews import HoloviewsWriter + @pytest.fixture def filepath_png(tmp_path): diff --git a/kedro-datasets/tests/json/test_json_dataset.py b/kedro-datasets/tests/json/test_json_dataset.py index dafdd8e3e..621e51fcd 100644 --- a/kedro-datasets/tests/json/test_json_dataset.py +++ b/kedro-datasets/tests/json/test_json_dataset.py @@ -6,9 +6,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.json import JSONDataSet from s3fs.core import S3FileSystem +from kedro_datasets.json import JSONDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py index 9fcf09c0c..8fff3edd2 100644 --- a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py +++ b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py @@ -7,9 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.svmlight import SVMLightDataSet from s3fs.core import S3FileSystem +from kedro_datasets.svmlight import SVMLightDataSet + @pytest.fixture def filepath_svm(tmp_path): diff --git a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py index ed4dec348..4086e127e 100644 --- a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py +++ b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py @@ -6,10 +6,11 @@ import matplotlib.pyplot as plt import pytest from kedro.io import DataSetError, Version -from kedro_datasets.matplotlib import MatplotlibWriter from moto import mock_s3 from s3fs import S3FileSystem +from kedro_datasets.matplotlib import MatplotlibWriter + BUCKET_NAME = "test_bucket" AWS_CREDENTIALS = {"key": "testing", "secret": "testing"} KEY_PATH = "matplotlib" diff --git a/kedro-datasets/tests/networkx/test_gml_dataset.py b/kedro-datasets/tests/networkx/test_gml_dataset.py index dd589019d..a3a89eca7 100644 --- a/kedro-datasets/tests/networkx/test_gml_dataset.py +++ b/kedro-datasets/tests/networkx/test_gml_dataset.py @@ -7,9 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.networkx import GMLDataSet from s3fs.core import 
S3FileSystem +from kedro_datasets.networkx import GMLDataSet + ATTRS = { "source": "from", "target": "to", diff --git a/kedro-datasets/tests/networkx/test_graphml_dataset.py b/kedro-datasets/tests/networkx/test_graphml_dataset.py index 9ff22883e..4e0dcf40d 100644 --- a/kedro-datasets/tests/networkx/test_graphml_dataset.py +++ b/kedro-datasets/tests/networkx/test_graphml_dataset.py @@ -7,9 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.networkx import GraphMLDataSet from s3fs.core import S3FileSystem +from kedro_datasets.networkx import GraphMLDataSet + ATTRS = { "source": "from", "target": "to", diff --git a/kedro-datasets/tests/networkx/test_json_dataset.py b/kedro-datasets/tests/networkx/test_json_dataset.py index ed437f69a..4d6e582a8 100644 --- a/kedro-datasets/tests/networkx/test_json_dataset.py +++ b/kedro-datasets/tests/networkx/test_json_dataset.py @@ -7,9 +7,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.networkx import JSONDataSet from s3fs.core import S3FileSystem +from kedro_datasets.networkx import JSONDataSet + ATTRS = { "source": "from", "target": "to", diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 53a1e7c52..5cc1ee36b 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -12,11 +12,12 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp -from kedro_datasets.pandas import CSVDataSet from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import CSVDataSet + BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py index bae8c5147..1080cc9b6 100644 --- a/kedro-datasets/tests/pandas/test_excel_dataset.py +++ b/kedro-datasets/tests/pandas/test_excel_dataset.py @@ -7,10 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import ExcelDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import ExcelDataSet + @pytest.fixture def filepath_excel(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_feather_dataset.py b/kedro-datasets/tests/pandas/test_feather_dataset.py index ec995d657..80c1ce678 100644 --- a/kedro-datasets/tests/pandas/test_feather_dataset.py +++ b/kedro-datasets/tests/pandas/test_feather_dataset.py @@ -7,10 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import FeatherDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import FeatherDataSet + @pytest.fixture def filepath_feather(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_gbq_dataset.py b/kedro-datasets/tests/pandas/test_gbq_dataset.py index d970db36e..e239dbaba 100644 --- a/kedro-datasets/tests/pandas/test_gbq_dataset.py +++ b/kedro-datasets/tests/pandas/test_gbq_dataset.py @@ -4,9 +4,10 @@ import pytest from google.cloud.exceptions import NotFound from 
kedro.io.core import DataSetError -from kedro_datasets.pandas import GBQQueryDataSet, GBQTableDataSet from pandas.testing import assert_frame_equal +from kedro_datasets.pandas import GBQQueryDataSet, GBQTableDataSet + DATASET = "dataset" TABLE_NAME = "table_name" PROJECT = "project" diff --git a/kedro-datasets/tests/pandas/test_generic_dataset.py b/kedro-datasets/tests/pandas/test_generic_dataset.py index 2526c1ed6..6f40bb0d4 100644 --- a/kedro-datasets/tests/pandas/test_generic_dataset.py +++ b/kedro-datasets/tests/pandas/test_generic_dataset.py @@ -9,10 +9,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER, generate_timestamp -from kedro_datasets.pandas import GenericDataSet from pandas._testing import assert_frame_equal from s3fs import S3FileSystem +from kedro_datasets.pandas import GenericDataSet + @pytest.fixture def filepath_sas(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_hdf_dataset.py b/kedro-datasets/tests/pandas/test_hdf_dataset.py index c59e7a104..563ba63d9 100644 --- a/kedro-datasets/tests/pandas/test_hdf_dataset.py +++ b/kedro-datasets/tests/pandas/test_hdf_dataset.py @@ -7,10 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import HDFDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import HDFDataSet + HDF_KEY = "data" diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index 7da50165e..df2e856d5 100644 --- a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -8,10 +8,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import JSONDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import JSONDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/pandas/test_parquet_dataset.py b/kedro-datasets/tests/pandas/test_parquet_dataset.py index cc62ed203..2d7ce2996 100644 --- a/kedro-datasets/tests/pandas/test_parquet_dataset.py +++ b/kedro-datasets/tests/pandas/test_parquet_dataset.py @@ -7,11 +7,12 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import ParquetDataSet from pandas.testing import assert_frame_equal from pyarrow.fs import FSSpecHandler, PyFileSystem from s3fs.core import S3FileSystem +from kedro_datasets.pandas import ParquetDataSet + FILENAME = "test.parquet" diff --git a/kedro-datasets/tests/pandas/test_sql_dataset.py b/kedro-datasets/tests/pandas/test_sql_dataset.py index b810748c2..308582859 100644 --- a/kedro-datasets/tests/pandas/test_sql_dataset.py +++ b/kedro-datasets/tests/pandas/test_sql_dataset.py @@ -6,6 +6,7 @@ import pytest import sqlalchemy from kedro.io import DataSetError + from kedro_datasets.pandas import SQLQueryDataSet, SQLTableDataSet TABLE_NAME = "table_a" diff --git a/kedro-datasets/tests/pandas/test_xml_dataset.py b/kedro-datasets/tests/pandas/test_xml_dataset.py index 65be88174..bd62ea586 100644 --- a/kedro-datasets/tests/pandas/test_xml_dataset.py +++ b/kedro-datasets/tests/pandas/test_xml_dataset.py @@ -8,10 +8,11 @@ from gcsfs import GCSFileSystem from kedro.io import 
DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pandas import XMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pandas import XMLDataSet + @pytest.fixture def filepath_xml(tmp_path): diff --git a/kedro-datasets/tests/pickle/test_pickle_dataset.py b/kedro-datasets/tests/pickle/test_pickle_dataset.py index 2846201cf..fb95681a3 100644 --- a/kedro-datasets/tests/pickle/test_pickle_dataset.py +++ b/kedro-datasets/tests/pickle/test_pickle_dataset.py @@ -8,10 +8,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.pickle import PickleDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.pickle import PickleDataSet + @pytest.fixture def filepath_pickle(tmp_path): diff --git a/kedro-datasets/tests/pillow/test_image_dataset.py b/kedro-datasets/tests/pillow/test_image_dataset.py index ed27e3cb9..ea500b20d 100644 --- a/kedro-datasets/tests/pillow/test_image_dataset.py +++ b/kedro-datasets/tests/pillow/test_image_dataset.py @@ -6,10 +6,11 @@ from fsspec.implementations.local import LocalFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp -from kedro_datasets.pillow import ImageDataSet from PIL import Image, ImageChops from s3fs.core import S3FileSystem +from kedro_datasets.pillow import ImageDataSet + @pytest.fixture def filepath_png(tmp_path): diff --git a/kedro-datasets/tests/plotly/test_json_dataset.py b/kedro-datasets/tests/plotly/test_json_dataset.py index 0115a72dd..ab6e17d9c 100644 --- a/kedro-datasets/tests/plotly/test_json_dataset.py +++ b/kedro-datasets/tests/plotly/test_json_dataset.py @@ -8,9 +8,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.plotly import JSONDataSet from s3fs.core import S3FileSystem +from kedro_datasets.plotly import JSONDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/plotly/test_plotly_dataset.py b/kedro-datasets/tests/plotly/test_plotly_dataset.py index 9b33492bf..a422060e8 100644 --- a/kedro-datasets/tests/plotly/test_plotly_dataset.py +++ b/kedro-datasets/tests/plotly/test_plotly_dataset.py @@ -8,11 +8,12 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER -from kedro_datasets.plotly import PlotlyDataSet from plotly import graph_objects from plotly.graph_objs import Scatter from s3fs.core import S3FileSystem +from kedro_datasets.plotly import PlotlyDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index 4c0807d91..d79183539 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -12,11 +12,12 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp -from kedro_datasets.polars import CSVDataSet from moto import mock_s3 from polars.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.polars import CSVDataSet + BUCKET_NAME = "test_bucket" FILE_NAME = "test.csv" diff --git a/kedro-datasets/tests/redis/test_redis_dataset.py b/kedro-datasets/tests/redis/test_redis_dataset.py 
index ddda22c17..eaa8abbd2 100644 --- a/kedro-datasets/tests/redis/test_redis_dataset.py +++ b/kedro-datasets/tests/redis/test_redis_dataset.py @@ -8,9 +8,10 @@ import pytest import redis from kedro.io import DataSetError -from kedro_datasets.redis import PickleDataSet from pandas.testing import assert_frame_equal +from kedro_datasets.redis import PickleDataSet + @pytest.fixture(params=["pickle"]) def backend(request): diff --git a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py index d73731df2..2133953b5 100644 --- a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py +++ b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py @@ -6,6 +6,7 @@ try: import snowflake.snowpark as sp + from kedro_datasets.snowflake import SnowparkTableDataSet as spds except ImportError: pass # this is only for test discovery to succeed on Python <> 3.8 diff --git a/kedro-datasets/tests/spark/test_deltatable_dataset.py b/kedro-datasets/tests/spark/test_deltatable_dataset.py index 430c78ea2..5cbbe62b7 100644 --- a/kedro-datasets/tests/spark/test_deltatable_dataset.py +++ b/kedro-datasets/tests/spark/test_deltatable_dataset.py @@ -4,11 +4,12 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner -from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql.utils import AnalysisException +from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet + @pytest.fixture def sample_spark_df(): diff --git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index 9a3e58035..9452b007d 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -12,10 +12,6 @@ from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner, SequentialRunner -from kedro_datasets.pandas import CSVDataSet, ParquetDataSet -from kedro_datasets.pickle import PickleDataSet -from kedro_datasets.spark import SparkDataSet -from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils from moto import mock_s3 from pyspark.sql import SparkSession from pyspark.sql.functions import col @@ -28,6 +24,11 @@ ) from pyspark.sql.utils import AnalysisException +from kedro_datasets.pandas import CSVDataSet, ParquetDataSet +from kedro_datasets.pickle import PickleDataSet +from kedro_datasets.spark import SparkDataSet +from kedro_datasets.spark.spark_dataset import _dbfs_exists, _dbfs_glob, _get_dbutils + FOLDER_NAME = "fake_folder" FILENAME = "test.parquet" BUCKET_NAME = "test_bucket" diff --git a/kedro-datasets/tests/spark/test_spark_hive_dataset.py b/kedro-datasets/tests/spark/test_spark_hive_dataset.py index 88c18aee6..e0b8fc333 100644 --- a/kedro-datasets/tests/spark/test_spark_hive_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_hive_dataset.py @@ -5,12 +5,13 @@ import pytest from kedro.io import DataSetError -from kedro_datasets.spark import SparkHiveDataSet from psutil import Popen from pyspark import SparkContext from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from kedro_datasets.spark import SparkHiveDataSet + TESTSPARKDIR = "test_spark_dir" diff --git 
a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py index 73e091ef9..0f3d0e66b 100644 --- a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py @@ -2,6 +2,7 @@ import pytest from kedro.io import DataSetError + from kedro_datasets.spark import SparkJDBCDataSet diff --git a/kedro-datasets/tests/text/test_text_dataset.py b/kedro-datasets/tests/text/test_text_dataset.py index a4bee6896..733cc6c1f 100644 --- a/kedro-datasets/tests/text/test_text_dataset.py +++ b/kedro-datasets/tests/text/test_text_dataset.py @@ -6,9 +6,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.text import TextDataSet from s3fs.core import S3FileSystem +from kedro_datasets.text import TextDataSet + STRING = "Write to text file." diff --git a/kedro-datasets/tests/tracking/test_json_dataset.py b/kedro-datasets/tests/tracking/test_json_dataset.py index 2529868c4..62172b1a4 100644 --- a/kedro-datasets/tests/tracking/test_json_dataset.py +++ b/kedro-datasets/tests/tracking/test_json_dataset.py @@ -6,9 +6,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.tracking import JSONDataSet from s3fs.core import S3FileSystem +from kedro_datasets.tracking import JSONDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/tracking/test_metrics_dataset.py b/kedro-datasets/tests/tracking/test_metrics_dataset.py index ad9f4a1cb..2c1157de9 100644 --- a/kedro-datasets/tests/tracking/test_metrics_dataset.py +++ b/kedro-datasets/tests/tracking/test_metrics_dataset.py @@ -6,9 +6,10 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.tracking import MetricsDataSet from s3fs.core import S3FileSystem +from kedro_datasets.tracking import MetricsDataSet + @pytest.fixture def filepath_json(tmp_path): diff --git a/kedro-datasets/tests/video/conftest.py b/kedro-datasets/tests/video/conftest.py index 0dd5576dc..7a0a4c87b 100644 --- a/kedro-datasets/tests/video/conftest.py +++ b/kedro-datasets/tests/video/conftest.py @@ -1,10 +1,11 @@ from pathlib import Path import pytest -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from PIL import Image from utils import TEST_FPS, TEST_HEIGHT, TEST_WIDTH +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo + @pytest.fixture(scope="module") def red_frame(): diff --git a/kedro-datasets/tests/video/test_video_dataset.py b/kedro-datasets/tests/video/test_video_dataset.py index b4428c4df..1ac3d1ce4 100644 --- a/kedro-datasets/tests/video/test_video_dataset.py +++ b/kedro-datasets/tests/video/test_video_dataset.py @@ -1,11 +1,12 @@ import boto3 import pytest from kedro.io import DataSetError -from kedro_datasets.video import VideoDataSet -from kedro_datasets.video.video_dataset import FileVideo, SequenceVideo from moto import mock_s3 from utils import TEST_FPS, assert_videos_equal +from kedro_datasets.video import VideoDataSet +from kedro_datasets.video.video_dataset import FileVideo, SequenceVideo + S3_BUCKET_NAME = "test_bucket" S3_KEY_PATH = "video" S3_FULL_PATH = f"s3://{S3_BUCKET_NAME}/{S3_KEY_PATH}/" diff --git a/kedro-datasets/tests/video/test_video_objects.py 
b/kedro-datasets/tests/video/test_video_objects.py index 3adb701d2..1cb7cca75 100644 --- a/kedro-datasets/tests/video/test_video_objects.py +++ b/kedro-datasets/tests/video/test_video_objects.py @@ -1,6 +1,5 @@ import numpy as np import pytest -from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo from utils import ( DEFAULT_FOURCC, MJPEG_FOURCC, @@ -22,6 +21,8 @@ assert_images_equal, ) +from kedro_datasets.video.video_dataset import FileVideo, GeneratorVideo, SequenceVideo + class TestSequenceVideo: def test_sequence_video_indexing_first(self, color_video, red_frame): diff --git a/kedro-datasets/tests/yaml/test_yaml_dataset.py b/kedro-datasets/tests/yaml/test_yaml_dataset.py index 2cadeee7d..653606c17 100644 --- a/kedro-datasets/tests/yaml/test_yaml_dataset.py +++ b/kedro-datasets/tests/yaml/test_yaml_dataset.py @@ -7,10 +7,11 @@ from gcsfs import GCSFileSystem from kedro.io import DataSetError from kedro.io.core import PROTOCOL_DELIMITER, Version -from kedro_datasets.yaml import YAMLDataSet from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem +from kedro_datasets.yaml import YAMLDataSet + @pytest.fixture def filepath_yaml(tmp_path): From 66440f4094ea48b0ac6119fb0b284d559e4ad685 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 11:07:58 +0100 Subject: [PATCH 42/96] lint Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 2 +- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 2 +- kedro-docker/features/steps/cli_steps.py | 1 + kedro-docker/kedro_docker/plugin.py | 4 ++-- kedro-docker/tests/test_helpers.py | 1 + 5 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 4d7695e4e..d68db8745 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -199,7 +199,7 @@ def _save(self, data: DataFrame) -> None: .start() ) - def _exists(self, schema_path: str) -> bool: + def custom_exists(self, schema_path: str) -> bool: """Check the existence of pyspark dataframe. 
Args: schema_path: schema of saved streaming dataframe diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 82b90481c..d782961a2 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -80,4 +80,4 @@ def test_save(self, tmp_path, sample_spark_streaming_df): assert not streaming_ds._exists(schema_path) streaming_ds.save(loaded_with_streaming) - assert streaming_ds._exists(schema_path) + assert streaming_ds.custom_exists(schema_path) diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 2c680fd70..0306c1e2f 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -8,6 +8,7 @@ import behave import yaml from behave import given, then, when + from features.steps.sh_run import ChildTerminatingPopen, run from features.steps.util import ( TimeoutException, diff --git a/kedro-docker/kedro_docker/plugin.py b/kedro-docker/kedro_docker/plugin.py index cc8dda1c4..27af7db96 100644 --- a/kedro-docker/kedro_docker/plugin.py +++ b/kedro-docker/kedro_docker/plugin.py @@ -125,9 +125,9 @@ def docker_init(spark): if KEDRO_VERSION.match(">=0.17.0"): verbose = KedroCliError.VERBOSE_ERROR else: - from kedro.framework.cli.cli import ( + from kedro.framework.cli.cli import ( # noqa # pylint:disable=import-outside-toplevel, no-name-in-module _VERBOSE as verbose, - ) # noqa # pylint:disable=import-outside-toplevel, no-name-in-module + ) docker_file_version = "spark" if spark else "simple" docker_file = f"Dockerfile.{docker_file_version}" diff --git a/kedro-docker/tests/test_helpers.py b/kedro-docker/tests/test_helpers.py index f205c9efe..40b5d9306 100644 --- a/kedro-docker/tests/test_helpers.py +++ b/kedro-docker/tests/test_helpers.py @@ -3,6 +3,7 @@ import pytest from click import ClickException + from kedro_docker.helpers import ( add_jupyter_args, check_docker_image_exists, From 0ed5b90a8b253860919ac27a4d2eeff73b01d4e3 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 11:20:37 +0100 Subject: [PATCH 43/96] lint Signed-off-by: Tingting_Wan --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index d782961a2..a859b7639 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -77,7 +77,7 @@ def test_save(self, tmp_path, sample_spark_streaming_df): file_format="json", save_args={"checkpoint": checkpoint_path, "output_mode": "append"}, ) - assert not streaming_ds._exists(schema_path) + assert not streaming_ds.custom_exists(schema_path) streaming_ds.save(loaded_with_streaming) assert streaming_ds.custom_exists(schema_path) From 04c623bdfabf9ba4f1ba7b7d022566565d8745a0 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 13:31:49 +0100 Subject: [PATCH 44/96] update test cases Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 7 +++---- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 5 +++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index d68db8745..54b407d84 100644 
--- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -199,16 +199,15 @@ def _save(self, data: DataFrame) -> None: .start() ) - def custom_exists(self, schema_path: str) -> bool: + def _exists(self) -> bool: """Check the existence of pyspark dataframe. Args: schema_path: schema of saved streaming dataframe """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) - with open(schema_path, encoding="utf-8") as schema_file: - schema = StructType.fromJson(json.loads(schema_file.read())) + try: - self._get_spark().readStream.schema(schema).load( + self._get_spark().readStream.schema(self._schema).load( load_path, self._file_format ) except AnalysisException as exception: diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index a859b7639..1794fd54a 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -75,9 +75,10 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds = SparkStreamingDataSet( filepath=filepath_output, file_format="json", + load_args={"schema": {"filepath": schema_path}}, save_args={"checkpoint": checkpoint_path, "output_mode": "append"}, ) - assert not streaming_ds.custom_exists(schema_path) + assert not streaming_ds.exists() streaming_ds.save(loaded_with_streaming) - assert streaming_ds.custom_exists(schema_path) + assert streaming_ds.exists() From a76f944d5ba9ead0fd7dd0e9a74694fe71f56d24 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 14:26:50 +0100 Subject: [PATCH 45/96] add negative test Signed-off-by: Tingting_Wan --- .../spark/test_spark_streaming_dataset.py | 54 +++++++++++++++++-- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 1794fd54a..d3c72968d 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,13 +1,16 @@ +import re import json - +from pathlib import Path import pytest from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet +SCHEMA_FILE_NAME = "schema.json" def sample_schema(schema_path): with open(schema_path, encoding="utf-8") as f: @@ -29,7 +32,7 @@ def sample_spark_streaming_df(tmp_path): ] ) data = [("0001", 2), ("0001", 7), ("0002", 4)] - schema_path = (tmp_path / "test.json").as_posix() + schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() with open(schema_path, "w", encoding="utf-8") as f: json.dump(schema.jsonValue(), f) return SparkSession.builder.getOrCreate().createDataFrame(data, schema) @@ -38,7 +41,7 @@ def sample_spark_streaming_df(tmp_path): class TestStreamingDataSet: def test_load(self, tmp_path, sample_spark_streaming_df): filepath = (tmp_path / "test_streams").as_posix() - schema_path = (tmp_path / "test.json").as_posix() + schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() spark_json_ds = SparkDataSet( filepath=filepath, file_format="json", save_args=[{"mode", "overwrite"}] @@ -57,7 +60,7 @@ def test_load(self, tmp_path, 
sample_spark_streaming_df): def test_save(self, tmp_path, sample_spark_streaming_df): filepath_json = (tmp_path / "test_streams").as_posix() filepath_output = (tmp_path / "test_streams_output").as_posix() - schema_path = (tmp_path / "test.json").as_posix() + schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() checkpoint_path = (tmp_path / "checkpoint").as_posix() spark_json_ds = SparkDataSet( @@ -82,3 +85,46 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() + def test_load_options_invalid_schema_file(self, tmp_path): + filepath = (tmp_path / "data").as_posix() + schemapath = (tmp_path / SCHEMA_FILE_NAME).as_posix() + Path(schemapath).write_text("dummy", encoding="utf-8") + + pattern = ( + f"Contents of 'schema.filepath' ({schemapath}) are invalid. Please" + f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." + ) + + with pytest.raises(DataSetError, match=re.escape(pattern)): + SparkStreamingDataSet( + filepath=filepath, + file_format="csv", + load_args={"header": True, "schema": {"filepath": schemapath}}, + ) + + def test_load_options_invalid_schema(self, tmp_path): + filepath = (tmp_path / "data").as_posix() + + pattern = ( + "Schema load argument does not specify a 'filepath' attribute. Please" + "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." + ) + + with pytest.raises(DataSetError, match=pattern): + SparkStreamingDataSet( + filepath=filepath, + file_format="csv", + load_args={"header": True, "schema": {}}, + ) + def test_exists_raises_error(self, mocker): + # exists should raise all errors except for + # AnalysisExceptions clearly indicating a missing file + spark_data_set = SparkStreamingDataSet(filepath="") + mocker.patch.object( + spark_data_set, + "_get_spark", + side_effect=AnalysisException("Other Exception", []), + ) + + with pytest.raises(DataSetError, match="Other Exception"): + spark_data_set.exists() \ No newline at end of file From 30b002dd5ffdd6825f834277cd8ca153ac899cb0 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 14:44:50 +0100 Subject: [PATCH 46/96] remove code snippets fpr testing Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 31 +++++-------------- .../spark/test_spark_streaming_dataset.py | 10 ++++-- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 54b407d84..98b9cff71 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -139,16 +139,7 @@ def _describe(self) -> Dict[str, Any]: @staticmethod def _get_spark(): - spark_conf_path = "conf/base/spark.yml" - if os.path.exists(spark_conf_path): - with open( - spark_conf_path, encoding="utf-8" - ) as File: # pylint: disable=invalid-name - parameters = yaml.load(File, Loader=SafeLoader) - spark_conf = SparkConf().setAll(parameters.items()) - spark = SparkSession.builder.config(conf=spark_conf).getOrCreate() - else: - spark = SparkSession.builder.getOrCreate() + spark = SparkSession.builder.getOrCreate() return spark def _load(self) -> DataFrame: @@ -158,19 +149,13 @@ def _load(self) -> DataFrame: Returns: Data from filepath as pyspark dataframe. 
""" - if self._schema: - input_constructor = ( - self._get_spark() - .readStream.schema(self._schema) - .format(self._file_format) - .options(**self._load_args) - ) - else: - input_constructor = ( - self._get_spark() - .readStream.format(self._file_format) - .options(**self._load_args) - ) + input_constructor = ( + self._get_spark() + .readStream + .schema(self._schema) + .format(self._file_format) + .options(**self._load_args) + ) return ( input_constructor.load() if self._file_format diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index d3c72968d..5f16dd2f3 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,17 +1,19 @@ -import re import json +import re from pathlib import Path + import pytest from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType - from pyspark.sql.utils import AnalysisException + from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet SCHEMA_FILE_NAME = "schema.json" + def sample_schema(schema_path): with open(schema_path, encoding="utf-8") as f: try: @@ -85,6 +87,7 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() + def test_load_options_invalid_schema_file(self, tmp_path): filepath = (tmp_path / "data").as_posix() schemapath = (tmp_path / SCHEMA_FILE_NAME).as_posix() @@ -116,6 +119,7 @@ def test_load_options_invalid_schema(self, tmp_path): file_format="csv", load_args={"header": True, "schema": {}}, ) + def test_exists_raises_error(self, mocker): # exists should raise all errors except for # AnalysisExceptions clearly indicating a missing file @@ -127,4 +131,4 @@ def test_exists_raises_error(self, mocker): ) with pytest.raises(DataSetError, match="Other Exception"): - spark_data_set.exists() \ No newline at end of file + spark_data_set.exists() From 9bef3a2116b17095147c8d8e416e0a20518825c8 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 15:06:29 +0100 Subject: [PATCH 47/96] lint Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 98b9cff71..b80a4d6d4 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,23 +1,19 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" import json -import os from copy import deepcopy from pathlib import PurePosixPath from typing import Any, Dict import fsspec -import yaml from kedro.io.core import ( AbstractDataSet, DataSetError, get_filepath_str, get_protocol_and_path, ) -from pyspark import SparkConf from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType from pyspark.sql.utils import AnalysisException -from yaml.loader import SafeLoader from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix @@ -151,8 +147,7 @@ def _load(self) -> DataFrame: """ input_constructor = ( self._get_spark() - .readStream - .schema(self._schema) + .readStream.schema(self._schema) 
.format(self._file_format) .options(**self._load_args) ) From 0bb5fe1968bd7dce8f707347e1c3777aabbac0ff Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Tue, 2 May 2023 17:13:53 +0100 Subject: [PATCH 48/96] update tests Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 4 ---- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index b80a4d6d4..5c617809d 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -180,10 +180,6 @@ def _save(self, data: DataFrame) -> None: ) def _exists(self) -> bool: - """Check the existence of pyspark dataframe. - Args: - schema_path: schema of saved streaming dataframe - """ load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) try: diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 5f16dd2f3..5c606b676 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -65,18 +65,22 @@ def test_save(self, tmp_path, sample_spark_streaming_df): schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() checkpoint_path = (tmp_path / "checkpoint").as_posix() + # Save the sample json file to temp_path for creating dataframe spark_json_ds = SparkDataSet( filepath=filepath_json, file_format="json", save_args=[{"mode", "overwrite"}], ) spark_json_ds.save(sample_spark_streaming_df) + + # Load the json file as the streaming dataframe loaded_with_streaming = SparkStreamingDataSet( filepath=filepath_json, file_format="json", load_args={"schema": {"filepath": schema_path}}, ).load() + # Append json streams to filepath_output with specified schema path streaming_ds = SparkStreamingDataSet( filepath=filepath_output, file_format="json", From e0ebe2741543c7281bfd354316b5f5558f383df6 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 11:23:39 +0100 Subject: [PATCH 49/96] update test and remove redundacy Signed-off-by: Tingting_Wan --- .../spark/spark_streaming_dataset.py | 34 ++----------- .../spark/test_spark_streaming_dataset.py | 48 ------------------- 2 files changed, 3 insertions(+), 79 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 5c617809d..09f01294c 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -1,21 +1,17 @@ """SparkStreamingDataSet to load and save a PySpark Streaming DataFrame.""" -import json from copy import deepcopy from pathlib import PurePosixPath from typing import Any, Dict -import fsspec from kedro.io.core import ( AbstractDataSet, - DataSetError, - get_filepath_str, - get_protocol_and_path, ) from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.types import StructType from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix +from kedro_datasets.spark.spark_dataset import SparkDataSet + class SparkStreamingDataSet(AbstractDataSet): @@ -97,32 +93,8 @@ def __init__( self._schema = self._load_args.pop("schema", None) if self._schema is not None: if 
isinstance(self._schema, dict): - self._schema = self._load_schema_from_file(self._schema) - - @staticmethod - def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: - filepath = schema.get("filepath") - if not filepath: - raise DataSetError( - "Schema load argument does not specify a 'filepath' attribute. Please" - "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." - ) + self._schema = SparkDataSet._load_schema_from_file(self._schema) - credentials = deepcopy(schema.get("credentials")) or {} - protocol, schema_path = get_protocol_and_path(filepath) - file_system = fsspec.filesystem(protocol, **credentials) - pure_posix_path = PurePosixPath(schema_path) - load_path = get_filepath_str(pure_posix_path, protocol) - - # Open schema file - with file_system.open(load_path, encoding="utf-8") as fs_file: - try: - return StructType.fromJson(json.loads(fs_file.read())) - except Exception as exc: - raise DataSetError( - f"Contents of 'schema.filepath' ({schema_path}) are invalid. Please" - f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." - ) from exc def _describe(self) -> Dict[str, Any]: """Returns a dict that describes attributes of the dataset.""" diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 5c606b676..d8b9a1c77 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,12 +1,9 @@ import json -import re -from pathlib import Path import pytest from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType -from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet @@ -91,48 +88,3 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() - - def test_load_options_invalid_schema_file(self, tmp_path): - filepath = (tmp_path / "data").as_posix() - schemapath = (tmp_path / SCHEMA_FILE_NAME).as_posix() - Path(schemapath).write_text("dummy", encoding="utf-8") - - pattern = ( - f"Contents of 'schema.filepath' ({schemapath}) are invalid. Please" - f"provide a valid JSON-serialised 'pyspark.sql.types.StructType'." - ) - - with pytest.raises(DataSetError, match=re.escape(pattern)): - SparkStreamingDataSet( - filepath=filepath, - file_format="csv", - load_args={"header": True, "schema": {"filepath": schemapath}}, - ) - - def test_load_options_invalid_schema(self, tmp_path): - filepath = (tmp_path / "data").as_posix() - - pattern = ( - "Schema load argument does not specify a 'filepath' attribute. Please" - "include a path to a JSON-serialised 'pyspark.sql.types.StructType'." 
- ) - - with pytest.raises(DataSetError, match=pattern): - SparkStreamingDataSet( - filepath=filepath, - file_format="csv", - load_args={"header": True, "schema": {}}, - ) - - def test_exists_raises_error(self, mocker): - # exists should raise all errors except for - # AnalysisExceptions clearly indicating a missing file - spark_data_set = SparkStreamingDataSet(filepath="") - mocker.patch.object( - spark_data_set, - "_get_spark", - side_effect=AnalysisException("Other Exception", []), - ) - - with pytest.raises(DataSetError, match="Other Exception"): - spark_data_set.exists() From 5bb5766c9425d2019f2a28ecb00ca775927b4752 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 11:32:28 +0100 Subject: [PATCH 50/96] linting Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 09f01294c..84cc17d23 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -3,15 +3,15 @@ from pathlib import PurePosixPath from typing import Any, Dict -from kedro.io.core import ( - AbstractDataSet, -) +from kedro.io.core import AbstractDataSet from pyspark.sql import DataFrame, SparkSession from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix -from kedro_datasets.spark.spark_dataset import SparkDataSet - +from kedro_datasets.spark.spark_dataset import ( + SparkDataSet, + _split_filepath, + _strip_dbfs_prefix, +) class SparkStreamingDataSet(AbstractDataSet): @@ -95,7 +95,6 @@ def __init__( if isinstance(self._schema, dict): self._schema = SparkDataSet._load_schema_from_file(self._schema) - def _describe(self) -> Dict[str, Any]: """Returns a dict that describes attributes of the dataset.""" return { From 20757812592fcd2af0f063b643ce35e87c779e0f Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 12:08:38 +0100 Subject: [PATCH 51/96] refactor file format Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/README.md | 13 +++++++++---- .../kedro_datasets/spark/spark_streaming_dataset.py | 13 ++----------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index 910289135..82ca7a041 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -7,6 +7,15 @@ To work with multiple streaming nodes, 2 hook are required for: - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details - Running streaming query without termination unless exception +#### Supported file formats + +Supported file formats are: +- Text +- CSV +- JSON +- ORC +- Parquet + #### Example SparkStreamsHook: ```python @@ -30,9 +39,5 @@ To make the application work with kafka format, respective spark configuration n ```yaml spark.driver.maxResultSize: 3g spark.scheduler.mode: FAIR -spark.sql.streaming.schemaInference: True -spark.streaming.stopGracefullyOnShutdown: true # graceful shutdown guarantees (under some conditions, listed below in the post) that all received data is processed before destroying Spark context -spark.sql.streaming.stateStore.stateSchemaCheck: false # since 
schema is not mentioned explicitly -spark.jars.packages: org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1 # spark and kafka configuraton for reading kafka files (not required if kafka is not used) ``` diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 84cc17d23..8c6fa21f4 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -74,7 +74,6 @@ def __init__( self._file_format = file_format self._save_args = save_args self._load_args = load_args - self.output_format = ["kafka"] fs_prefix, filepath = _split_filepath(filepath) @@ -122,12 +121,7 @@ def _load(self) -> DataFrame: .format(self._file_format) .options(**self._load_args) ) - return ( - input_constructor.load() - if self._file_format - in self.output_format # if the connector type is message broker - else input_constructor.load(self._filepath_) - ) + return input_constructor.load(self._filepath_) def _save(self, data: DataFrame) -> None: """Saves pyspark dataframe. @@ -137,14 +131,11 @@ def _save(self, data: DataFrame) -> None: output_constructor = data.writeStream.format(self._file_format) - # for message brokers path is not needed - if self._file_format not in self.output_format: - output_constructor = output_constructor.option("path", self._filepath_) - ( output_constructor.option( "checkpointLocation", self._save_args.pop("checkpoint") ) + .option("path", self._filepath_) .outputMode(self._save_args.pop("output_mode")) .options(**self._save_args) .start() From e8ea0d37a8f0a5e7b248c241d1a18b6cbed45631 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 14:58:04 +0100 Subject: [PATCH 52/96] fix read me file Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index 82ca7a041..c134ac2ea 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -10,6 +10,7 @@ To work with multiple streaming nodes, 2 hook are required for: #### Supported file formats Supported file formats are: + - Text - CSV - JSON From f08dd095845d67d3ce8167a2be88ff4a5b78e93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Thu, 4 May 2023 10:56:14 +0200 Subject: [PATCH 53/96] docs: Add community contributions (#199) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add community contributions Signed-off-by: Juan Luis Cano Rodríguez * Use newer link to docs Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez --- kedro-datasets/RELEASE.md | 5 +++++ kedro-datasets/kedro_datasets/README.md | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index ddc06407c..bd1d1e73c 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -7,6 +7,11 @@ ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. 
+## Community contributions +Many thanks to the following Kedroids for contributing PRs to this release: + +* [BrianCechmanek](https://github.com/BrianCechmanek) + # Release 1.2.1: ## Major features and improvements: diff --git a/kedro-datasets/kedro_datasets/README.md b/kedro-datasets/kedro_datasets/README.md index 53360c747..8e2344a30 100644 --- a/kedro-datasets/kedro_datasets/README.md +++ b/kedro-datasets/kedro_datasets/README.md @@ -10,7 +10,7 @@ These data descriptions are supported with the APIs of `pandas`, `spark`, `netwo [The Data Catalog](https://kedro.readthedocs.io/en/stable/data/data_catalog.html) allows you to work with a range of file formats on local file systems, network file systems, cloud object stores, and Hadoop. -Here is a full list of [supported data descriptions and APIs](https://kedro.readthedocs.io/en/stable/kedro.datasets.html). +Here is a full list of [supported data descriptions and APIs](https://docs.kedro.org/en/stable/kedro_datasets.html). ## How can I create my own `AbstractDataSet` implementation? From 24bb52741330df4533e60a03daf76c7f6861bc4e Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 18:05:32 +0100 Subject: [PATCH 54/96] adding test for raise error Signed-off-by: Tingting_Wan --- .../tests/spark/test_spark_streaming_dataset.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index d8b9a1c77..67d217d30 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -4,6 +4,7 @@ from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet @@ -88,3 +89,16 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() + + def test_exists_raises_error(self, mocker): + # exists should raise all errors except for + # AnalysisExceptions clearly indicating a missing file + spark_data_set = SparkStreamingDataSet(filepath="") + mocker.patch.object( + spark_data_set, + "_get_spark", + side_effect=AnalysisException("Other Exception", []), + ) + + with pytest.raises(DataSetError, match="Other Exception"): + spark_data_set.exists() From 437e77e7025390338768bc60ce69b9c596a3b2fc Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 11:23:39 +0100 Subject: [PATCH 55/96] update test and remove redundacy Signed-off-by: Tingting_Wan Signed-off-by: Tom Kurian --- .../kedro_datasets/spark/spark_streaming_dataset.py | 12 ++++++------ .../tests/spark/test_spark_streaming_dataset.py | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 8c6fa21f4..63632929a 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -3,15 +3,15 @@ from pathlib import PurePosixPath from typing import Any, Dict -from kedro.io.core import AbstractDataSet +from kedro.io.core import ( + AbstractDataSet, +) from pyspark.sql import DataFrame, SparkSession from pyspark.sql.utils 
import AnalysisException -from kedro_datasets.spark.spark_dataset import ( - SparkDataSet, - _split_filepath, - _strip_dbfs_prefix, -) +from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix +from kedro_datasets.spark.spark_dataset import SparkDataSet + class SparkStreamingDataSet(AbstractDataSet): diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 67d217d30..cc9a5ab4b 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -4,7 +4,6 @@ from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType -from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet @@ -89,6 +88,7 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() +<<<<<<< HEAD def test_exists_raises_error(self, mocker): # exists should raise all errors except for @@ -102,3 +102,5 @@ def test_exists_raises_error(self, mocker): with pytest.raises(DataSetError, match="Other Exception"): spark_data_set.exists() +======= +>>>>>>> d1472e2 (update test and remove redundacy) From a3fdbf6fb8880dbaf580493c847cb11d28099322 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 11:32:28 +0100 Subject: [PATCH 56/96] linting Signed-off-by: Tingting_Wan Signed-off-by: Tom Kurian --- .../kedro_datasets/spark/spark_streaming_dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 63632929a..8c6fa21f4 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -3,15 +3,15 @@ from pathlib import PurePosixPath from typing import Any, Dict -from kedro.io.core import ( - AbstractDataSet, -) +from kedro.io.core import AbstractDataSet from pyspark.sql import DataFrame, SparkSession from pyspark.sql.utils import AnalysisException -from kedro_datasets.spark.spark_dataset import _split_filepath, _strip_dbfs_prefix -from kedro_datasets.spark.spark_dataset import SparkDataSet - +from kedro_datasets.spark.spark_dataset import ( + SparkDataSet, + _split_filepath, + _strip_dbfs_prefix, +) class SparkStreamingDataSet(AbstractDataSet): From 9d60f25a55726048e1329d701be32b90c5ef3044 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 12:08:38 +0100 Subject: [PATCH 57/96] refactor file format Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index c134ac2ea..82ca7a041 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -10,7 +10,6 @@ To work with multiple streaming nodes, 2 hook are required for: #### Supported file formats Supported file formats are: - - Text - CSV - JSON From ced007dbf24641de42a6a24552e9a2c64e594c03 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 14:58:04 +0100 Subject: [PATCH 58/96] fix read me file Signed-off-by: Tom Kurian --- 
kedro-datasets/kedro_datasets/spark/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index 82ca7a041..c134ac2ea 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -10,6 +10,7 @@ To work with multiple streaming nodes, 2 hook are required for: #### Supported file formats Supported file formats are: + - Text - CSV - JSON From 0b88324eb0c8bebef9f97505723a76e68d3a68ab Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 4 May 2023 18:05:32 +0100 Subject: [PATCH 59/96] adding test for raise error Signed-off-by: Tingting_Wan Signed-off-by: Tom Kurian --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index cc9a5ab4b..5abffe3f5 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -4,6 +4,7 @@ from kedro.io.core import DataSetError from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from pyspark.sql.utils import AnalysisException from kedro_datasets.spark.spark_dataset import SparkDataSet from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet @@ -88,7 +89,6 @@ def test_save(self, tmp_path, sample_spark_streaming_df): streaming_ds.save(loaded_with_streaming) assert streaming_ds.exists() -<<<<<<< HEAD def test_exists_raises_error(self, mocker): # exists should raise all errors except for @@ -101,6 +101,4 @@ def test_exists_raises_error(self, mocker): ) with pytest.raises(DataSetError, match="Other Exception"): - spark_data_set.exists() -======= ->>>>>>> d1472e2 (update test and remove redundacy) + spark_data_set.exists() \ No newline at end of file From ed26aadc46f2209360397b6fd3aa67ba98139f98 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 20:20:41 +0100 Subject: [PATCH 60/96] fix readme file Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index c134ac2ea..3979b6935 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -11,11 +11,11 @@ To work with multiple streaming nodes, 2 hook are required for: Supported file formats are: -- Text -- CSV -- JSON -- ORC -- Parquet +1. Text +1. CSV +1. JSON +1. ORC +1. Parquet #### Example SparkStreamsHook: From 170b09297b5bfd81f259ce81f24ef816ca121121 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 20:23:09 +0100 Subject: [PATCH 61/96] fix readme Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index 3979b6935..a09165f14 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -11,11 +11,11 @@ To work with multiple streaming nodes, 2 hook are required for: Supported file formats are: -1. Text -1. CSV -1. JSON -1. ORC -1. 
Parquet +- Text +- CSV +- JSON +- ORC +- Parquet #### Example SparkStreamsHook: From e63a53acd8dd8282c99fbae3ebe1e6837d7c01e1 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 21:33:29 +0100 Subject: [PATCH 62/96] fix conflicts Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 8c6fa21f4..e63893306 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -155,4 +155,4 @@ def _exists(self) -> bool: ): return False raise - return True + return True \ No newline at end of file From d986c7521e833e166c8531747bef7cb44c0888bf Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Thu, 4 May 2023 21:38:27 +0100 Subject: [PATCH 63/96] fix ci erors Signed-off-by: Tom Kurian --- kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py | 2 +- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index e63893306..8c6fa21f4 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -155,4 +155,4 @@ def _exists(self) -> bool: ): return False raise - return True \ No newline at end of file + return True diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 5abffe3f5..67d217d30 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -101,4 +101,4 @@ def test_exists_raises_error(self, mocker): ) with pytest.raises(DataSetError, match="Other Exception"): - spark_data_set.exists() \ No newline at end of file + spark_data_set.exists() From 64232fa609bdf2b0e63d4bc3006dbbd0953d71e8 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Fri, 5 May 2023 11:07:00 +0100 Subject: [PATCH 64/96] fix lint issue Signed-off-by: Tom Kurian --- kedro-telemetry/kedro_telemetry/plugin.py | 1 + kedro-telemetry/tests/test_masking.py | 1 + kedro-telemetry/tests/test_plugin.py | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/kedro-telemetry/kedro_telemetry/plugin.py b/kedro-telemetry/kedro_telemetry/plugin.py index 1027d541d..5eeb4d489 100644 --- a/kedro-telemetry/kedro_telemetry/plugin.py +++ b/kedro-telemetry/kedro_telemetry/plugin.py @@ -22,6 +22,7 @@ from kedro.framework.startup import ProjectMetadata from kedro.io.data_catalog import DataCatalog from kedro.pipeline import Pipeline + from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.masking import _get_cli_structure, _mask_kedro_cli diff --git a/kedro-telemetry/tests/test_masking.py b/kedro-telemetry/tests/test_masking.py index 1e674096b..74773e2f4 100644 --- a/kedro-telemetry/tests/test_masking.py +++ b/kedro-telemetry/tests/test_masking.py @@ -9,6 +9,7 @@ from kedro import __version__ as kedro_version from kedro.framework.cli.cli import KedroCLI, cli from kedro.framework.startup import ProjectMetadata + from kedro_telemetry.masking import ( MASK, _get_cli_structure, diff --git a/kedro-telemetry/tests/test_plugin.py 
b/kedro-telemetry/tests/test_plugin.py index 9b1a6460b..222bcc914 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -9,6 +9,8 @@ from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline +from pytest import fixture + from kedro_telemetry import __version__ as TELEMETRY_VERSION from kedro_telemetry.plugin import ( KedroTelemetryCLIHooks, @@ -16,7 +18,6 @@ _check_for_telemetry_consent, _confirm_consent, ) -from pytest import fixture REPO_NAME = "dummy_project" PACKAGE_NAME = "dummy_package" From 8a61b41798b73ffd4d98b23f82c6b579af9523e1 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Fri, 5 May 2023 14:07:13 +0100 Subject: [PATCH 65/96] update class documentation Signed-off-by: Tom Kurian --- .../spark/spark_streaming_dataset.py | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 8c6fa21f4..567c4405c 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -21,19 +21,16 @@ class SparkStreamingDataSet(AbstractDataSet): data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml raw.new_inventory: - type: spark.SparkStreamingDataSet - filepath: data/01_raw/stream/inventory/ - file_format: json - int.new_inventory: - type: spark.SparkStreamingDataSet - filepath: data/02_intermediate/inventory/ - file_format: csv - save_args: - output_mode: append - checkpoint: data/04_checkpoint/int_new_inventory - header: True - load_args: - header: True + type: streaming.extras.datasets.spark_streaming_dataset.SparkStreamingDataSet + filepath: data/01_raw/stream/inventory/ + file_format: json + save_args: + output_mode: append + checkpoint: data/04_checkpoint/raw_new_inventory + header: True + load_args: + schema: + filepath: data/01_raw/schema/inventory_schema.json """ # pylint: disable=too-many-instance-attributes From 37e66e8c603b48272ec1771181e6162505fc2a53 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 16 May 2023 21:14:44 +0100 Subject: [PATCH 66/96] add additional test cases Signed-off-by: Tom Kurian --- .../spark/test_spark_streaming_dataset.py | 73 ++++++++++++++++++- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 67d217d30..4d920980f 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -1,7 +1,9 @@ import json +import boto3 import pytest from kedro.io.core import DataSetError +from moto import mock_s3 from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql.utils import AnalysisException @@ -10,9 +12,12 @@ from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet SCHEMA_FILE_NAME = "schema.json" +BUCKET_NAME = "test_bucket" +AWS_CREDENTIALS = {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} def sample_schema(schema_path): + """read the schema file from json path""" with open(schema_path, encoding="utf-8") as f: try: return StructType.fromJson(json.loads(f.read())) @@ -24,18 +29,51 @@ def sample_schema(schema_path): @pytest.fixture -def 
sample_spark_streaming_df(tmp_path): - schema = StructType( +def sample_spark_df_schema() -> StructType: + """Spark Dataframe schema""" + return StructType( [ StructField("sku", StringType(), True), StructField("new_stock", IntegerType(), True), ] ) + + +@pytest.fixture +def sample_spark_streaming_df(tmp_path, sample_spark_df_schema): + """Create s sample dataframe for streaming""" data = [("0001", 2), ("0001", 7), ("0002", 4)] schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() with open(schema_path, "w", encoding="utf-8") as f: - json.dump(schema.jsonValue(), f) - return SparkSession.builder.getOrCreate().createDataFrame(data, schema) + json.dump(sample_spark_df_schema.jsonValue(), f) + return SparkSession.builder.getOrCreate().createDataFrame( + data, sample_spark_df_schema + ) + + +@pytest.fixture +def mocked_s3_bucket(): + """Create a bucket for testing using moto.""" + with mock_s3(): + conn = boto3.client( + "s3", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + conn.create_bucket(Bucket=BUCKET_NAME) + yield conn + + +@pytest.fixture +def mocked_s3_schema(tmp_path, mocked_s3_bucket, sample_spark_df_schema: StructType): + """Creates schema file and adds it to mocked S3 bucket.""" + temporary_path = tmp_path / SCHEMA_FILE_NAME + temporary_path.write_text(sample_spark_df_schema.json(), encoding="utf-8") + + mocked_s3_bucket.put_object( + Bucket=BUCKET_NAME, Key=SCHEMA_FILE_NAME, Body=temporary_path.read_bytes() + ) + return mocked_s3_bucket class TestStreamingDataSet: @@ -57,6 +95,33 @@ def test_load(self, tmp_path, sample_spark_streaming_df): schema = sample_schema(schema_path) assert streaming_ds.schema == schema + @pytest.mark.usefixtures("mocked_s3_schema") + def test_load_options_schema_path_with_credentials( + self, tmp_path, sample_spark_streaming_df + ): + filepath = (tmp_path / "test_streams").as_posix() + schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() + + spark_json_ds = SparkDataSet( + filepath=filepath, file_format="json", save_args=[{"mode", "overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) + + streaming_ds = SparkStreamingDataSet( + filepath=filepath, + file_format="json", + load_args={ + "schema": { + "filepath": f"s3://{BUCKET_NAME}/{SCHEMA_FILE_NAME}", + "credentials": AWS_CREDENTIALS, + } + }, + ).load() + + assert streaming_ds.isStreaming + schema = sample_schema(schema_path) + assert streaming_ds.schema == schema + def test_save(self, tmp_path, sample_spark_streaming_df): filepath_json = (tmp_path / "test_streams").as_posix() filepath_output = (tmp_path / "test_streams_output").as_posix() From 07032a8f3c89a8cd51eb59febec403203ed4b3a3 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 16 May 2023 21:36:37 +0100 Subject: [PATCH 67/96] add s3 read test cases Signed-off-by: Tom Kurian --- .../spark/test_spark_streaming_dataset.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 4d920980f..38a33337d 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -64,6 +64,15 @@ def mocked_s3_bucket(): yield conn +@pytest.fixture +def s3_bucket(): + with mock_s3(): + s3 = boto3.resource("s3", region_name="us-east-1") + bucket_name = "test-bucket" + s3.create_bucket(Bucket=bucket_name) + yield bucket_name + + @pytest.fixture def mocked_s3_schema(tmp_path, mocked_s3_bucket, 
sample_spark_df_schema: StructType): """Creates schema file and adds it to mocked S3 bucket.""" @@ -95,6 +104,28 @@ def test_load(self, tmp_path, sample_spark_streaming_df): schema = sample_schema(schema_path) assert streaming_ds.schema == schema + def test_read_dataframe_from_s3( + self, tmp_path, sample_spark_streaming_df, s3_bucket + ): + + s3_path = f"s3://{s3_bucket}/test-data" + schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() + + spark_json_ds = SparkDataSet( + filepath=s3_path, file_format="json", save_args=[{"mode", "overwrite"}] + ) + spark_json_ds.save(sample_spark_streaming_df) + + streaming_ds = SparkStreamingDataSet( + filepath=s3_path, + file_format="json", + load_args={"schema": {"filepath": schema_path}}, + ).load() + + assert streaming_ds.isStreaming + schema = sample_schema(schema_path) + assert streaming_ds.schema == schema + @pytest.mark.usefixtures("mocked_s3_schema") def test_load_options_schema_path_with_credentials( self, tmp_path, sample_spark_streaming_df From 2470de19eede465d0de9568850d5dce34c3dbb99 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 16 May 2023 21:49:51 +0100 Subject: [PATCH 68/96] add s3 read test cases Signed-off-by: Tom Kurian --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 38a33337d..cae0742b6 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -108,7 +108,7 @@ def test_read_dataframe_from_s3( self, tmp_path, sample_spark_streaming_df, s3_bucket ): - s3_path = f"s3://{s3_bucket}/test-data" + s3_path = f"s3a://{s3_bucket}/test-data/01_raw/*" schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() spark_json_ds = SparkDataSet( From c4e0f4e2cc8125ab94463ad74b91a5e22471fa55 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 16 May 2023 22:12:44 +0100 Subject: [PATCH 69/96] add s3 read test case Signed-off-by: Tom Kurian --- .../tests/spark/test_spark_streaming_dataset.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index cae0742b6..52966deb0 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -111,10 +111,11 @@ def test_read_dataframe_from_s3( s3_path = f"s3a://{s3_bucket}/test-data/01_raw/*" schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() - spark_json_ds = SparkDataSet( - filepath=s3_path, file_format="json", save_args=[{"mode", "overwrite"}] - ) - spark_json_ds.save(sample_spark_streaming_df) + # spark_json_ds = SparkDataSet( + # filepath=s3_path, file_format="json", save_args=[{"mode", "overwrite"}] + # ) + # spark_json_ds.save(sample_spark_streaming_df) + sample_spark_streaming_df.write.json(s3_path) streaming_ds = SparkStreamingDataSet( filepath=s3_path, From 7e3555e80501076074e0d7eb162058ac34b881e4 Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 16 May 2023 22:18:58 +0100 Subject: [PATCH 70/96] test s3 read Signed-off-by: Tom Kurian --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 52966deb0..203afcefa 100644 
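The S3 tests above exercise the case where only the schema file lives on S3 (read with the supplied credentials) while the streaming data itself is read from `filepath`. A sketch of the equivalent direct usage outside the test suite is shown below; the bucket, key, local path and credential values are placeholders, not values from the diffs.

```python
# Sketch of the usage covered by test_load_options_schema_path_with_credentials.
# Bucket, key, local path and credential values below are placeholders.
from kedro_datasets.spark.spark_streaming_dataset import SparkStreamingDataSet

inventory_stream = SparkStreamingDataSet(
    filepath="data/01_raw/stream/inventory/",        # placeholder local path
    file_format="json",
    load_args={
        "schema": {
            "filepath": "s3://my-bucket/inventory_schema.json",  # placeholder
            "credentials": {"key": "...", "secret": "..."},      # placeholder
        }
    },
)

streaming_df = inventory_stream.load()  # returns a streaming PySpark DataFrame
assert streaming_df.isStreaming
```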
--- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -115,6 +115,7 @@ def test_read_dataframe_from_s3( # filepath=s3_path, file_format="json", save_args=[{"mode", "overwrite"}] # ) # spark_json_ds.save(sample_spark_streaming_df) + sample_spark_streaming_df.write.json(s3_path) streaming_ds = SparkStreamingDataSet( From 6a0029dcd20846f5f88e9adaa7b41061c5fc314e Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Wed, 17 May 2023 15:28:18 +0100 Subject: [PATCH 71/96] remove redundant test cases Signed-off-by: Tom Kurian --- .../spark/test_spark_streaming_dataset.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 203afcefa..9b91ab56f 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -104,30 +104,6 @@ def test_load(self, tmp_path, sample_spark_streaming_df): schema = sample_schema(schema_path) assert streaming_ds.schema == schema - def test_read_dataframe_from_s3( - self, tmp_path, sample_spark_streaming_df, s3_bucket - ): - - s3_path = f"s3a://{s3_bucket}/test-data/01_raw/*" - schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() - - # spark_json_ds = SparkDataSet( - # filepath=s3_path, file_format="json", save_args=[{"mode", "overwrite"}] - # ) - # spark_json_ds.save(sample_spark_streaming_df) - - sample_spark_streaming_df.write.json(s3_path) - - streaming_ds = SparkStreamingDataSet( - filepath=s3_path, - file_format="json", - load_args={"schema": {"filepath": schema_path}}, - ).load() - - assert streaming_ds.isStreaming - schema = sample_schema(schema_path) - assert streaming_ds.schema == schema - @pytest.mark.usefixtures("mocked_s3_schema") def test_load_options_schema_path_with_credentials( self, tmp_path, sample_spark_streaming_df From e8f6696efa1f3b015e08d61697b3884b3f97a65b Mon Sep 17 00:00:00 2001 From: Tom Kurian Date: Tue, 23 May 2023 21:35:56 +0100 Subject: [PATCH 72/96] fix streaming dataset configurations Signed-off-by: Tom Kurian --- .../kedro_datasets/spark/spark_streaming_dataset.py | 13 ++++++------- .../tests/spark/test_spark_streaming_dataset.py | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 567c4405c..b4c9cb68c 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -67,7 +67,6 @@ def __init__( write documentation: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html """ - self._filepath_ = filepath self._file_format = file_format self._save_args = save_args self._load_args = load_args @@ -102,8 +101,7 @@ def _describe(self) -> Dict[str, Any]: @staticmethod def _get_spark(): - spark = SparkSession.builder.getOrCreate() - return spark + return SparkSession.builder.getOrCreate() def _load(self) -> DataFrame: """Loads data from filepath. @@ -112,27 +110,28 @@ def _load(self) -> DataFrame: Returns: Data from filepath as pyspark dataframe. 
""" - input_constructor = ( + load_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) + data_stream_reader = ( self._get_spark() .readStream.schema(self._schema) .format(self._file_format) .options(**self._load_args) ) - return input_constructor.load(self._filepath_) + return data_stream_reader.load(load_path) def _save(self, data: DataFrame) -> None: """Saves pyspark dataframe. Args: data: PySpark streaming dataframe for saving """ - + save_path = _strip_dbfs_prefix(self._fs_prefix + str(self._filepath)) output_constructor = data.writeStream.format(self._file_format) ( output_constructor.option( "checkpointLocation", self._save_args.pop("checkpoint") ) - .option("path", self._filepath_) + .option("path", save_path) .outputMode(self._save_args.pop("output_mode")) .options(**self._save_args) .start() diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index 9b91ab56f..b4e1f0414 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -85,7 +85,7 @@ def mocked_s3_schema(tmp_path, mocked_s3_bucket, sample_spark_df_schema: StructT return mocked_s3_bucket -class TestStreamingDataSet: +class TestSparkStreamingDataSet: def test_load(self, tmp_path, sample_spark_streaming_df): filepath = (tmp_path / "test_streams").as_posix() schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() From 9a5ebad1ae4e4d0d64eaf1239f92ac7180b85e08 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 25 May 2023 16:51:58 +0100 Subject: [PATCH 73/96] update streaming datasets doc Signed-off-by: Tingting_Wan --- .../kedro_datasets/spark/spark_streaming_dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index b4c9cb68c..0f7e841ed 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -30,10 +30,9 @@ class SparkStreamingDataSet(AbstractDataSet): header: True load_args: schema: - filepath: data/01_raw/schema/inventory_schema.json + filepath: data/01_raw/schema/inventory_schema.json """ - # pylint: disable=too-many-instance-attributes DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] @@ -58,7 +57,9 @@ def __init__( It is dependent on the selected file format. You can find a list of read options for each supported format in Spark DataFrame read documentation: - https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html + https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html, + Please note that a schema is mandatory for a streaming DataFrame if schemaInference + is not True. save_args: Save args passed to Spark DataFrame write options. Similar to load_args this is dependent on the selected file format. 
You can pass ``mode`` and ``partitionBy`` to specify From eacdd461043beb9bf44342f1d5b237f5fdb86fdb Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 25 May 2023 16:58:06 +0100 Subject: [PATCH 74/96] resolve comments re documentation Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index a09165f14..bdc62c9c4 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -3,7 +3,7 @@ ``SparkStreamingDataSet`` loads and saves data to streaming DataFrames. See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. -To work with multiple streaming nodes, 2 hook are required for: +To work with multiple streaming nodes, 2 hooks are required for: - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details - Running streaming query without termination unless exception @@ -11,8 +11,8 @@ To work with multiple streaming nodes, 2 hook are required for: Supported file formats are: -- Text -- CSV +- Text +- CSV - JSON - ORC - Parquet @@ -33,7 +33,7 @@ class SparkStreamsHook: spark = SparkSession.builder.getOrCreate() spark.streams.awaitAnyTermination() ``` -To make the application work with kafka format, respective spark configuration need to be added in ``conf/base/spark.yml``. +To make the application work with Kafka format, the respective spark configuration needs to be added to``conf/base/spark.yml``. #### Example spark.yml: From 68b6e1bfdc17812c143cce2a0b374cc165497a99 Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 25 May 2023 17:04:10 +0100 Subject: [PATCH 75/96] bugfix lint Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index bdc62c9c4..f8df9e94f 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -11,7 +11,7 @@ To work with multiple streaming nodes, 2 hooks are required for: Supported file formats are: -- Text +- Text - CSV - JSON - ORC From 5b2a479cc5a3fb28ab725cd210bc04a62f7d7dfc Mon Sep 17 00:00:00 2001 From: Tingting_Wan Date: Thu, 25 May 2023 17:15:01 +0100 Subject: [PATCH 76/96] update link Signed-off-by: Tingting_Wan --- kedro-datasets/kedro_datasets/spark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/README.md b/kedro-datasets/kedro_datasets/spark/README.md index f8df9e94f..7400c3c47 100644 --- a/kedro-datasets/kedro_datasets/spark/README.md +++ b/kedro-datasets/kedro_datasets/spark/README.md @@ -4,7 +4,7 @@ See [Spark Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) for details. 
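Since the docstring above notes that a schema is mandatory for a streaming DataFrame unless schema inference is enabled, it may help to see how the JSON file referenced by `load_args.schema.filepath` can be produced. The sketch below mirrors the test fixtures (`StructType.jsonValue()` to write, `StructType.fromJson()` to read back); the field names and the output path are taken from the fixture and the docstring example respectively, not from any one production catalog.

```python
# Sketch: writing and reading back a schema file usable as
# load_args.schema.filepath. Field names mirror the test fixture.
import json

from pyspark.sql.types import IntegerType, StringType, StructField, StructType

schema = StructType(
    [
        StructField("sku", StringType(), True),
        StructField("new_stock", IntegerType(), True),
    ]
)

schema_path = "data/01_raw/schema/inventory_schema.json"  # path from the docstring example

# Serialise the StructType to JSON, as the test fixture does.
with open(schema_path, "w", encoding="utf-8") as f:
    json.dump(schema.jsonValue(), f)

# Restore it, as sample_schema() does in the tests.
with open(schema_path, encoding="utf-8") as f:
    restored_schema = StructType.fromJson(json.load(f))

assert restored_schema == schema
```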
To work with multiple streaming nodes, 2 hooks are required for: - - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/tools_integration/pyspark.html) for details + - Integrating Pyspark, see [Build a Kedro pipeline with PySpark](https://docs.kedro.org/en/stable/integrations/pyspark_integration.html) for details - Running streaming query without termination unless exception #### Supported file formats From b94f2116e312e71161fb5aa12b34b5bf8ea4a79a Mon Sep 17 00:00:00 2001 From: Nok Chan Date: Fri, 26 May 2023 15:43:22 +0100 Subject: [PATCH 77/96] revert the changes on CI Signed-off-by: Nok Chan --- tools/circleci/circleci_release.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py index e8f5d8449..dd05d4c5a 100755 --- a/tools/circleci/circleci_release.py +++ b/tools/circleci/circleci_release.py @@ -8,6 +8,7 @@ import requests from requests.structures import CaseInsensitiveDict + from utils.check_no_version_pypi import check_no_version_pypi from utils.package_version import get_package_version From 9381816f9d7120d7cdc83b60135556199d5a6bef Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Wed, 17 May 2023 16:48:52 +0100 Subject: [PATCH 78/96] test(docker): remove outdated logging-related step (#207) * fixkedro- docker e2e test Signed-off-by: Nok Chan * fix: add timeout to request to satisfy bandit lint --------- Signed-off-by: Nok Chan Co-authored-by: Deepyaman Datta Signed-off-by: Tom Kurian --- kedro-docker/features/docker.feature | 1 - kedro-docker/features/docker_with_spark.feature | 1 - kedro-docker/features/steps/cli_steps.py | 15 --------------- 3 files changed, 17 deletions(-) diff --git a/kedro-docker/features/docker.feature b/kedro-docker/features/docker.feature index 7dffa4541..74580213e 100644 --- a/kedro-docker/features/docker.feature +++ b/kedro-docker/features/docker.feature @@ -3,7 +3,6 @@ Feature: Docker commands in new projects Background: Given I have prepared a config file And I run a non-interactive kedro new using pandas-iris starter - And I have fixed logs write permission And I have installed the project dependencies And I have removed old docker image of test project diff --git a/kedro-docker/features/docker_with_spark.feature b/kedro-docker/features/docker_with_spark.feature index a10116476..012ad2595 100644 --- a/kedro-docker/features/docker_with_spark.feature +++ b/kedro-docker/features/docker_with_spark.feature @@ -3,7 +3,6 @@ Feature: Docker commands in new Spark projects Background: Given I have prepared a config file And I run a non-interactive kedro new using pyspark-iris starter - And I have fixed logs write permission And I have installed the project dependencies And I have removed old docker image of test project diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 0306c1e2f..30b80f749 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -126,21 +126,6 @@ def create_configuration_file(context): yaml.dump(config, config_file, default_flow_style=False) -@given("I have fixed logs write permission") -def modify_write_permission(context): - """ - Kedro-docker mounts some subdirectories the current directory (like logs, notebooks etc) - into the Docker container. - If you run kedro commands with different users, - they might create files and directories not writable by each other. - So we are fixing the permissions here. 
- """ - (context.root_project_dir / "logs").chmod(0o777) - journal_dir = context.root_project_dir / "logs" / "journals" - journal_dir.mkdir(parents=True, exist_ok=True) - journal_dir.chmod(0o777) - - @given("I run a non-interactive kedro new using {starter_name} starter") def create_project_from_config_file(context, starter_name): """Behave step to run kedro new From 373e166ba7b5eb13c73089f51d0a3b29c3f7f23f Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Thu, 18 May 2023 09:12:01 -0400 Subject: [PATCH 79/96] ci: ensure plugin requirements get installed in CI (#208) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ci: install the plugin alongside test requirements * ci: install the plugin alongside test requirements * Update kedro-airflow.yml * Update kedro-datasets.yml * Update kedro-docker.yml * Update kedro-telemetry.yml * Update kedro-airflow.yml * Update kedro-datasets.yml * Update kedro-airflow.yml * Update kedro-docker.yml * Update kedro-telemetry.yml * ci(telemetry): update isort config to correct sort * Don't use profile ¯\_(ツ)_/¯ Signed-off-by: Deepyaman Datta * chore(datasets): remove empty `tool.black` section * chore(docker): remove empty `tool.black` section --------- Signed-off-by: Deepyaman Datta Signed-off-by: Tom Kurian --- .circleci/continue_config.yml | 7 +++---- .github/workflows/check-plugin.yml | 6 +++--- .github/workflows/kedro-airflow.yml | 14 +++++++++----- .github/workflows/kedro-datasets.yml | 14 +++++++++----- .github/workflows/kedro-docker.yml | 14 +++++++++----- .github/workflows/kedro-telemetry.yml | 14 +++++++++----- kedro-datasets/pyproject.toml | 2 -- kedro-docker/pyproject.toml | 2 -- kedro-telemetry/pyproject.toml | 4 +--- 9 files changed, 43 insertions(+), 34 deletions(-) diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index 82653758e..d339e82c1 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -69,8 +69,7 @@ commands: command: | cd <> pip install git+https://github.com/kedro-org/kedro@main - pip install . - pip install -r test_requirements.txt + pip install . -r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - run: name: Install pre-commit hooks command: | @@ -177,7 +176,7 @@ commands: command: conda activate kedro_plugins; pip install git+https://github.com/kedro-org/kedro@main - run: name: Install all requirements - command: conda activate kedro_plugins; cd <>; pip install -r test_requirements.txt -U + command: conda activate kedro_plugins; cd <>; pip install . -r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - run: name: Pip freeze command: conda activate kedro_plugins; pip freeze @@ -323,7 +322,7 @@ jobs: - run: name: Maybe trigger the release workflow command: | - conda activate kedro_plugins; + conda activate kedro_plugins pip install requests ./tools/circleci/circleci_release.py diff --git a/.github/workflows/check-plugin.yml b/.github/workflows/check-plugin.yml index 4a3cf8827..29266046d 100644 --- a/.github/workflows/check-plugin.yml +++ b/.github/workflows/check-plugin.yml @@ -42,7 +42,7 @@ jobs: - name: Install dependencies run: | cd ${{ inputs.plugin }} - pip install -r test_requirements.txt + pip install . 
-r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - name: pip freeze run: pip freeze - name: Run unit tests for Linux / all plugins @@ -84,7 +84,7 @@ jobs: run: | cd ${{ inputs.plugin }} pip install git+https://github.com/kedro-org/kedro@main - pip install -r test_requirements.txt + pip install . -r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` pip freeze - name: Install pre-commit hooks run: | @@ -121,7 +121,7 @@ jobs: run: | cd ${{ inputs.plugin }} pip install git+https://github.com/kedro-org/kedro@main - pip install -r test_requirements.txt + pip install . -r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - name: pip freeze run: pip freeze - name: Run end to end tests diff --git a/.github/workflows/kedro-airflow.yml b/.github/workflows/kedro-airflow.yml index b68fcce30..d4e696061 100644 --- a/.github/workflows/kedro-airflow.yml +++ b/.github/workflows/kedro-airflow.yml @@ -1,12 +1,16 @@ -name: Run checks on kedro-airflow +name: Run checks on Kedro-Airflow on: push: - paths: - - "kedro-airflow/**" + paths-ignore: + - "kedro-datasets/**" + - "kedro-docker/**" + - "kedro-telemetry/**" pull_request: - paths: - - "kedro-airflow/**" + paths-ignore: + - "kedro-datasets/**" + - "kedro-docker/**" + - "kedro-telemetry/**" types: [ synchronize ] jobs: diff --git a/.github/workflows/kedro-datasets.yml b/.github/workflows/kedro-datasets.yml index 9ff4802b6..1b25f711b 100644 --- a/.github/workflows/kedro-datasets.yml +++ b/.github/workflows/kedro-datasets.yml @@ -1,12 +1,16 @@ -name: Run checks on kedro-datasets +name: Run checks on Kedro-Datasets on: push: - paths: - - "kedro-datasets/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-docker/**" + - "kedro-telemetry/**" pull_request: - paths: - - "kedro-datasets/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-docker/**" + - "kedro-telemetry/**" types: [ synchronize ] jobs: diff --git a/.github/workflows/kedro-docker.yml b/.github/workflows/kedro-docker.yml index 1812a3a93..4231ca545 100644 --- a/.github/workflows/kedro-docker.yml +++ b/.github/workflows/kedro-docker.yml @@ -1,12 +1,16 @@ -name: Run checks on kedro-docker +name: Run checks on Kedro-Docker on: push: - paths: - - "kedro-docker/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-telemetry/**" pull_request: - paths: - - "kedro-docker/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-telemetry/**" types: [ synchronize ] jobs: diff --git a/.github/workflows/kedro-telemetry.yml b/.github/workflows/kedro-telemetry.yml index fd75e8a71..ce5b82743 100644 --- a/.github/workflows/kedro-telemetry.yml +++ b/.github/workflows/kedro-telemetry.yml @@ -1,12 +1,16 @@ -name: Run checks on kedro-telemetry +name: Run checks on Kedro-Telemetry on: push: - paths: - - "kedro-telemetry/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-docker/**" pull_request: - paths: - - "kedro-telemetry/**" + paths-ignore: + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-docker/**" types: [ synchronize ] jobs: diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index a5f494106..11cf1a157 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -27,8 +27,6 @@ include = ["kedro_datasets*"] readme = {file = "README.md", content-type = "text/markdown"} version = {attr = "kedro_datasets.__version__"} -[tool.black] - [tool.isort] profile = "black" diff --git 
a/kedro-docker/pyproject.toml b/kedro-docker/pyproject.toml index cdd273509..0a9639956 100644 --- a/kedro-docker/pyproject.toml +++ b/kedro-docker/pyproject.toml @@ -53,8 +53,6 @@ addopts = """ --no-cov-on-fail -ra""" -[tool.black] - [tool.isort] multi_line_output = 3 include_trailing_comma = true diff --git a/kedro-telemetry/pyproject.toml b/kedro-telemetry/pyproject.toml index 0cc754854..81ec2c60b 100644 --- a/kedro-telemetry/pyproject.toml +++ b/kedro-telemetry/pyproject.toml @@ -42,6 +42,4 @@ include_trailing_comma = true force_grid_wrap = 0 use_parentheses = true line_length = 88 -known_third_party = "kedro" - -[tool.black] +known_first_party = "kedro_telemetry" From f033b951d0b41d9b60bf7dd0d7ca3c38dcc84745 Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Thu, 18 May 2023 14:18:00 +0100 Subject: [PATCH 80/96] ci: Migrate the release workflow from CircleCI to GitHub Actions (#203) * Create check-release.yml * change from test pypi to pypi * split into jobs and move version logic into script * update github actions output * lint * changes based on review * changes based on review * fix script to not append continuously * change pypi api token logic Signed-off-by: Tom Kurian --- .github/workflows/check-release.yml | 93 +++++++++++++++++++ .../github_actions/github_actions_release.py | 54 +++++++++++ 2 files changed, 147 insertions(+) create mode 100644 .github/workflows/check-release.yml create mode 100755 tools/github_actions/github_actions_release.py diff --git a/.github/workflows/check-release.yml b/.github/workflows/check-release.yml new file mode 100644 index 000000000..386810bbd --- /dev/null +++ b/.github/workflows/check-release.yml @@ -0,0 +1,93 @@ +name: Check versions and build-publish + +on: + push: + branches: + - main + +jobs: + check-version: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.10' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install requests + - name: Check version + run: python tools/github_actions/github_actions_release.py + - name: Set outputs + id: version_check + run: | + echo "new_release=${{ env.NEW_RELEASE }}" >> $GITHUB_OUTPUT + echo "package_name=${{ env.PACKAGE_NAME }}" >> $GITHUB_OUTPUT + echo "package_version=${{ env.PACKAGE_VERSION }}" >> $GITHUB_OUTPUT + outputs: + new_release: ${{ steps.version_check.outputs.new_release }} + package_name: ${{ steps.version_check.outputs.package_name }} + package_version: ${{ steps.version_check.outputs.package_version }} + + test: + needs: check-version + if: ${{ needs.check-version.outputs.new_release == 'true' }} + uses: ./.github/workflows/check-plugin.yml + with: + plugin: ${{ needs.check-version.outputs.package_name }} + + build-publish: + needs: [check-version, test] + if: ${{ needs.check-version.outputs.new_release == 'true' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.10' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: | + export plugin=${{ needs.check-version.outputs.package_name }} + make package + - name: Create GitHub Release + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.GH_TAGGING_TOKEN }} + script: | + const package_name = "${{ needs.check-version.outputs.package_name }}" + const package_version = "${{ 
needs.check-version.outputs.package_version }}" + const response = await github.rest.repos.createRelease({ + owner: context.repo.owner, + repo: context.repo.repo, + tag_name: `${package_name}-${package_version}`, + target_commitish: 'main', + name: `${package_name}-${package_version}`, + body: `Release ${package_version}`, + draft: false, + prerelease: false, + }); + return response.data; + - name: Set PyPI token + run: | + if [ "${{ needs.check-version.outputs.PACKAGE_NAME }}" == "kedro-airflow" ]; then + echo 'PYPI_TOKEN=${{ secrets.AIRFLOW_PYPI_TOKEN }}' >> $GITHUB_ENV + elif [ "${{ needs.check-version.outputs.PACKAGE_NAME }}" == "kedro-datasets" ]; then + echo 'PYPI_TOKEN=${{ secrets.DATASETS_PYPI_TOKEN }}' >> $GITHUB_ENV + elif [ "${{ needs.check-version.outputs.PACKAGE_NAME }}" == "kedro-docker" ]; then + echo 'PYPI_TOKEN=${{ secrets.DOCKER_PYPI_TOKEN }}' >> $GITHUB_ENV + elif [ "${{ needs.check-version.outputs.PACKAGE_NAME }}" == "kedro-telemetry" ]; then + echo 'PYPI_TOKEN=${{ secrets.TELEMETRY_PYPI_TOKEN }}' >> $GITHUB_ENV + fi + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: ${{ needs.check-version.outputs.package_name }}/dist + password: ${{ env.PYPI_TOKEN }} + diff --git a/tools/github_actions/github_actions_release.py b/tools/github_actions/github_actions_release.py new file mode 100755 index 000000000..cec1a8b97 --- /dev/null +++ b/tools/github_actions/github_actions_release.py @@ -0,0 +1,54 @@ +import os +import sys +import re +import requests +from pathlib import Path + +VERSION_MATCHSTR = r'\s*__version__\s*=\s*"(\d+\.\d+\.\d+)"' +PACKAGE_PATHS = ( + "kedro-datasets/kedro_datasets", + "kedro-telemetry/kedro_telemetry", + "kedro-airflow/kedro_airflow", + "kedro-docker/kedro_docker", +) + + +def get_package_version(base_path, package_path): + init_file_path = Path(base_path) / package_path / "__init__.py" + match_obj = re.search(VERSION_MATCHSTR, Path(init_file_path).read_text()) + return match_obj.group(1) + + +def check_no_version_pypi(pypi_endpoint, package_name, package_version): + print(f"Check if {package_name} {package_version} is on pypi") + response = requests.get(pypi_endpoint, timeout=10) + if response.status_code == 404: + # Version doesn't exist on Pypi - do release + print(f"Starting the release of {package_name} {package_version}") + return True + else: + print(f"Skipped: {package_name} {package_version} already exists on PyPI") + return False + + +if __name__ == "__main__": + """Check if a package needs to be released""" + base_path = Path() + new_release = "false" + package_name = None + package_version = None + + for package_path in PACKAGE_PATHS: + package_name, _ = package_path.split("/") + package_version = get_package_version(base_path, package_path) + pypi_endpoint = f"https://pypi.org/pypi/{package_name}/{package_version}/json/" + + if check_no_version_pypi(pypi_endpoint, package_name, package_version): + new_release = "true" + break + + env_file = os.getenv('GITHUB_ENV') + with open(env_file, "a") as env_file: + env_file.write(f"NEW_RELEASE={new_release}\n") + if new_release == "true": + env_file.write(f"PACKAGE_NAME={package_name}\nPACKAGE_VERSION={package_version}\n") From 3fdb71c1cfc1d41618fab5c216ac12da785d271e Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Thu, 18 May 2023 14:52:32 +0100 Subject: [PATCH 81/96] build: Relax Kedro bound for `kedro-datasets` (#140) * Less strict pin on Kedro for datasets Signed-off-by: Merel Theisen 
Signed-off-by: Tom Kurian --- kedro-datasets/RELEASE.md | 2 ++ kedro-datasets/pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index bd1d1e73c..2dbee5adc 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -4,6 +4,8 @@ * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). * Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. +* Relaxed Kedro version pin to `>=0.16` + ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 11cf1a157..457c18bc6 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -11,7 +11,7 @@ description = "Kedro-Datasets is where you can find all of Kedro's data connecto requires-python = ">=3.7, <3.11" license = {text = "Apache Software License (Apache 2.0)"} dependencies = [ - "kedro~=0.18.4", + "kedro>=0.16", ] dynamic = ["readme", "version", "optional-dependencies"] From b08aa6f617f783576b75fc84eee3c960e970c7c6 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Thu, 18 May 2023 10:37:30 -0400 Subject: [PATCH 82/96] ci: don't run checks on both `push`/`pull_request` (#192) * ci: don't run checks on both `push`/`pull_request` * ci: don't run checks on both `push`/`pull_request` * ci: don't run checks on both `push`/`pull_request` * ci: don't run checks on both `push`/`pull_request` Signed-off-by: Tom Kurian --- .github/workflows/kedro-airflow.yml | 17 ++++++++++------- .github/workflows/kedro-datasets.yml | 17 ++++++++++------- .github/workflows/kedro-docker.yml | 17 ++++++++++------- .github/workflows/kedro-telemetry.yml | 17 ++++++++++------- 4 files changed, 40 insertions(+), 28 deletions(-) diff --git a/.github/workflows/kedro-airflow.yml b/.github/workflows/kedro-airflow.yml index d4e696061..ef0c87ef9 100644 --- a/.github/workflows/kedro-airflow.yml +++ b/.github/workflows/kedro-airflow.yml @@ -2,16 +2,19 @@ name: Run checks on Kedro-Airflow on: push: + branches: + - main paths-ignore: - - "kedro-datasets/**" - - "kedro-docker/**" - - "kedro-telemetry/**" + - "kedro-datasets/**" + - "kedro-docker/**" + - "kedro-telemetry/**" pull_request: + branches: + - main paths-ignore: - - "kedro-datasets/**" - - "kedro-docker/**" - - "kedro-telemetry/**" - types: [ synchronize ] + - "kedro-datasets/**" + - "kedro-docker/**" + - "kedro-telemetry/**" jobs: airflow-test: diff --git a/.github/workflows/kedro-datasets.yml b/.github/workflows/kedro-datasets.yml index 1b25f711b..943453ee7 100644 --- a/.github/workflows/kedro-datasets.yml +++ b/.github/workflows/kedro-datasets.yml @@ -2,16 +2,19 @@ name: Run checks on Kedro-Datasets on: push: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-docker/**" - - "kedro-telemetry/**" + - "kedro-airflow/**" + - "kedro-docker/**" + - "kedro-telemetry/**" pull_request: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-docker/**" - - "kedro-telemetry/**" - types: [ synchronize ] + - "kedro-airflow/**" + - "kedro-docker/**" + - "kedro-telemetry/**" jobs: datasets-test: diff --git a/.github/workflows/kedro-docker.yml 
b/.github/workflows/kedro-docker.yml index 4231ca545..71a77cb24 100644 --- a/.github/workflows/kedro-docker.yml +++ b/.github/workflows/kedro-docker.yml @@ -2,16 +2,19 @@ name: Run checks on Kedro-Docker on: push: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-datasets/**" - - "kedro-telemetry/**" + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-telemetry/**" pull_request: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-datasets/**" - - "kedro-telemetry/**" - types: [ synchronize ] + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-telemetry/**" jobs: docker-test: diff --git a/.github/workflows/kedro-telemetry.yml b/.github/workflows/kedro-telemetry.yml index ce5b82743..f53841bde 100644 --- a/.github/workflows/kedro-telemetry.yml +++ b/.github/workflows/kedro-telemetry.yml @@ -2,16 +2,19 @@ name: Run checks on Kedro-Telemetry on: push: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-datasets/**" - - "kedro-docker/**" + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-docker/**" pull_request: + branches: + - main paths-ignore: - - "kedro-airflow/**" - - "kedro-datasets/**" - - "kedro-docker/**" - types: [ synchronize ] + - "kedro-airflow/**" + - "kedro-datasets/**" + - "kedro-docker/**" jobs: telemetry-test: From 148b464ef5df6bb83b5fa69ed14121c49ceb69de Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Fri, 19 May 2023 11:54:31 -0400 Subject: [PATCH 83/96] chore: delete extra space ending check-release.yml (#210) Signed-off-by: Tom Kurian --- .github/workflows/check-release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/check-release.yml b/.github/workflows/check-release.yml index 386810bbd..916cf70f7 100644 --- a/.github/workflows/check-release.yml +++ b/.github/workflows/check-release.yml @@ -90,4 +90,3 @@ jobs: with: packages-dir: ${{ needs.check-version.outputs.package_name }}/dist password: ${{ env.PYPI_TOKEN }} - From be2431c0cec4c316815aec1c9a162a5b38090801 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Fri, 19 May 2023 17:17:41 +0100 Subject: [PATCH 84/96] ci: Create merge-gatekeeper.yml to make sure PR only merged when all tests checked. (#215) * Create merge-gatekeeper.yml * Update .github/workflows/merge-gatekeeper.yml --------- Co-authored-by: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Signed-off-by: Tom Kurian --- .github/workflows/merge-gatekeeper.yml | 27 ++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflows/merge-gatekeeper.yml diff --git a/.github/workflows/merge-gatekeeper.yml b/.github/workflows/merge-gatekeeper.yml new file mode 100644 index 000000000..be615ecbd --- /dev/null +++ b/.github/workflows/merge-gatekeeper.yml @@ -0,0 +1,27 @@ +name: Merge Gatekeeper + +on: + pull_request: + branches: + - main + - develop + +jobs: + merge-gatekeeper: + runs-on: ubuntu-latest + # Restrict permissions of the GITHUB_TOKEN. + # Docs: https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs + permissions: + checks: read + statuses: read + steps: + - name: Run Merge Gatekeeper + # NOTE: v1 is updated to reflect the latest v1.x.y. 
Please use any tag/branch that suits your needs: + # https://github.com/upsidr/merge-gatekeeper/tags + # https://github.com/upsidr/merge-gatekeeper/branches + uses: upsidr/merge-gatekeeper@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + timeout: 1800 + interval: 30 + From 74a211f8b774fb6813579552e2e4e2b280121147 Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Fri, 19 May 2023 19:00:51 +0100 Subject: [PATCH 85/96] ci: Remove the CircleCI setup (#209) * remove circleci setup files and utils * remove circleci configs in kedro-telemetry * remove redundant .github in kedro-telemetry * Delete continue_config.yml * Update check-release.yml * lint * increase timeout to 40 mins for docker e2e tests Signed-off-by: Tom Kurian --- .circleci/config.yml | 38 -- .circleci/continue_config.yml | 516 ------------------ .github/workflows/merge-gatekeeper.yml | 3 +- kedro-telemetry/.circleci/config.yml | 131 ----- .../.github/ISSUE_TEMPLATE/bug-report.md | 41 -- .../.github/ISSUE_TEMPLATE/feature-request.md | 20 - .../.github/PULL_REQUEST_TEMPLATE.md | 13 - tools/circleci/circleci_release.py | 78 --- tools/circleci/github_release.py | 52 -- tools/circleci/utils/check_no_version_pypi.py | 13 - tools/circleci/utils/package_version.py | 21 - 11 files changed, 1 insertion(+), 925 deletions(-) delete mode 100644 .circleci/config.yml delete mode 100644 .circleci/continue_config.yml delete mode 100644 kedro-telemetry/.circleci/config.yml delete mode 100644 kedro-telemetry/.github/ISSUE_TEMPLATE/bug-report.md delete mode 100644 kedro-telemetry/.github/ISSUE_TEMPLATE/feature-request.md delete mode 100644 kedro-telemetry/.github/PULL_REQUEST_TEMPLATE.md delete mode 100755 tools/circleci/circleci_release.py delete mode 100755 tools/circleci/github_release.py delete mode 100644 tools/circleci/utils/check_no_version_pypi.py delete mode 100644 tools/circleci/utils/package_version.py diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index b8a27e1c3..000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,38 +0,0 @@ -version: 2.1 - -parameters: - release_package: - type: string - default: "" - release_version: - type: string - default: "" - -setup: true - -# the path-filtering orb is required to continue a pipeline based on -# the path of an updated fileset -orbs: - path-filtering: circleci/path-filtering@0.1.1 - -workflows: - always-run: - jobs: - # the path-filtering/filter job determines which pipeline - # parameters to update. - - path-filtering/filter: - name: check-updated-files - # 3-column, whitespace-delimited mapping. One mapping per - # line: - # - mapping: | - kedro-telemetry/.* run-build-kedro-telemetry true - kedro-docker/.* run-build-kedro-docker true - kedro-airflow/.* run-build-kedro-airflow true - kedro-datasets/.* run-build-kedro-datasets true - base-revision: main - # this is the path of the configuration we should trigger once - # path filtering and pipeline parameter value updates are - # complete. In this case, we are using the parent dynamic - # configuration itself. 
- config-path: .circleci/continue_config.yml diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml deleted file mode 100644 index d339e82c1..000000000 --- a/.circleci/continue_config.yml +++ /dev/null @@ -1,516 +0,0 @@ -version: 2.1 - -orbs: - win: circleci/windows@2.4.1 - -# the default pipeline parameters, which will be updated according to -# the results of the path-filtering orb -parameters: - run-build-kedro-telemetry: - type: boolean - default: false - run-build-kedro-docker: - type: boolean - default: false - run-build-kedro-airflow: - type: boolean - default: false - run-build-kedro-datasets: - type: boolean - default: false - release_package: - type: string - default: "" - release_version: - type: string - default: "" - -commands: - setup_conda: - parameters: - python_version: - type: string - steps: - - run: - name: Cleanup pyenv - command: sudo rm -rf .pyenv/ /opt/circleci/.pyenv/ - - run: - name: Download and install miniconda - command: | - curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - - run: - name: Create conda environment with correct python version - command: | - . /home/circleci/miniconda/etc/profile.d/conda.sh - conda create --name kedro_plugins python=<> -y - - run: - name: Setup bash env to run conda activation at each step - command: | - echo ". /home/circleci/miniconda/etc/profile.d/conda.sh" >> $BASH_ENV - echo "conda deactivate; conda activate kedro_plugins" >> $BASH_ENV - source $BASH_ENV - - setup_requirements: - parameters: - plugin: - type: string - steps: - - run: - name: Install pip setuptools - command: make install-pip-setuptools - - run: - # pytables does not work properly with python 3.9 to handle our HDFDataSet - # if pip-installed, so we install this dependency via conda - name: Install pytables - command: conda install -c conda-forge pytables -y - - run: - name: Install kedro and test requirements - command: | - cd <> - pip install git+https://github.com/kedro-org/kedro@main - pip install . -r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - - run: - name: Install pre-commit hooks - command: | - cd <> - pre-commit install --install-hooks - pre-commit install --hook-type pre-push - - run: - # this is needed to fix java cacerts so - # spark can automatically download packages from mvn - # https://stackoverflow.com/a/50103533/1684058 - name: Fix cacerts - command: | - sudo rm /etc/ssl/certs/java/cacerts - sudo update-ca-certificates -f - - run: - # Since recently Spark installation for some reason does not have enough permissions to execute - # /home/circleci/miniconda/envs/kedro_plugins/lib/python3.X/site-packages/pyspark/bin/spark-class. - # So fixing it manually here. 
- name: Fix Spark permissions - command: sudo chmod -R u+x /home/circleci/miniconda/envs/kedro_plugins/lib/ - - run: - name: Pip freeze - command: pip freeze - - setup: - parameters: - python_version: - type: string - plugin: - type: string - steps: - - checkout - - setup_conda: - python_version: <> - - setup_requirements: - plugin: <> - - # Windows specific commands - win_setup_conda: - # Miniconda3 is pre-installed on the machine: - # https://circleci.com/docs/2.0/hello-world-windows - parameters: - python_version: - type: string - steps: - - run: - name: Initialize conda - command: conda init powershell - - run: - name: Create 'kedro_plugins' conda environment - command: conda create --name kedro_plugins python=<> -y - - - win_setup_env: - steps: - - run: - # Required for Tensorflow tests - name: Install Microsoft Visual C++ Redistributable - command: | - $ProgressPreference = "SilentlyContinue" - Invoke-WebRequest https://aka.ms/vs/16/release/vc_redist.x64.exe -OutFile vc_redist.x64.exe - .\vc_redist.x64.exe /S /v/qn - - run: - name: Install Java 8 - command: | - $ProgressPreference = "SilentlyContinue" - Invoke-WebRequest https://github.com/AdoptOpenJDK/openjdk8-upstream-binaries/releases/download/jdk8u252-b09/OpenJDK8U-jdk_x64_windows_8u252b09.zip -OutFile OpenJDK8U.zip - Expand-Archive .\OpenJDK8U.zip -DestinationPath C:\OpenJDK8U - - run: - name: Create Inbound rules for Java - command: | - New-NetFirewallRule -DisplayName "Allow JDK UDP" -Profile "Public" -Protocol "UDP" -Direction Inbound -Program "C:\OpenJDK8U\openjdk-8u252-b09\bin\java.exe" -Action Allow - New-NetFirewallRule -DisplayName "Allow JDK TCP" -Profile "Public" -Protocol "TCP" -Direction Inbound -Program "C:\OpenJDK8U\openjdk-8u252-b09\bin\java.exe" -Action Allow - - run: - name: Set Java environment variables - command: | - [Environment]::SetEnvironmentVariable("Path", [Environment]::GetEnvironmentVariable('Path', 'Machine') + ";C:\OpenJDK8U\openjdk-8u252-b09\bin", "Machine") - setx /m JAVA_HOME "C:\OpenJDK8U\openjdk-8u252-b09" - - run: - name: Setup Hadoop binary - command: | - $ProgressPreference = "SilentlyContinue" - Invoke-WebRequest https://github.com/steveloughran/winutils/raw/master/hadoop-2.6.3/bin/winutils.exe -OutFile winutils.exe - New-Item -ItemType directory -Path C:\hadoop\bin - mv .\winutils.exe C:\hadoop\bin - setx /m HADOOP_HOME "C:\hadoop\" - - run: - name: Install 'make' command - command: choco install make - - - win_setup_requirements: - parameters: - plugin: - type: string - python_version: - type: string - steps: - - run: - name: Install GDAL, Fiona and pytables - command: conda activate kedro_plugins; conda install gdal fiona pytables -c conda-forge -y - - run: - name: Install Kedro - command: conda activate kedro_plugins; pip install git+https://github.com/kedro-org/kedro@main - - run: - name: Install all requirements - command: conda activate kedro_plugins; cd <>; pip install . 
-r test_requirements.txt # TODO(deepyaman): Define `test` extra and `pip install .[test]` - - run: - name: Pip freeze - command: conda activate kedro_plugins; pip freeze - -jobs: - unit_tests: - parameters: - python_version: - type: string - plugin: - type: string - machine: - image: ubuntu-2004:202201-02 - docker_layer_caching: true - steps: - - setup: - python_version: <> - plugin: <> - - run: - name: Run unit tests - command: make plugin=<> test - - e2e_tests: - parameters: - python_version: - type: string - plugin: - type: string - machine: - image: ubuntu-2004:202201-02 - docker_layer_caching: true - steps: - - setup: - python_version: <> - plugin: <> - - run: - name: Run e2e tests - command: make plugin=<> e2e-tests - - lint: - parameters: - plugin: - type: string - machine: - image: ubuntu-2004:202201-02 - docker_layer_caching: true - steps: - - setup: - python_version: "3.8" - plugin: <> - - run: - name: Run pylint and flake8 - command: make plugin=<> lint - - win_unit_tests: - parameters: - python_version: - type: string - plugin: - type: string - executor: - name: win/default - steps: - - checkout - - win_setup_conda: - python_version: <> - - win_setup_env - - win_setup_requirements: - plugin: <> - python_version: <> - # For anything not `kedro-datasets` - - unless: - condition: - equal: ["kedro-datasets", <>] - - # e2e tests are not currently runnable on CircleCI on Windows as - # those require the ability to run Linux containers: - # "The Windows executor currently only supports Windows containers. - # Running Linux containers on Windows is not possible for now" - # (from https://circleci.com/docs/2.0/hello-world-windows/) - steps: - - run: - name: Run unit tests - command: | - conda activate kedro_plugins - cd <> - pytest tests - - - run: - # geopandas and tensorflow conflicts when imported simultaneously. - # The HDF5 header files used to compile this application do not match - # the version used by the HDF5 library to which this application is linked. - # Data corruption or segmentation faults may occur if the application continues. - # This can happen when an application was compiled by one version of HDF5 but - # linked with a different version of static or shared HDF5 library. - # You should recompile the application or check your shared library related - # settings such as 'LD_LIBRARY_PATH'. - # You can, at your own risk, disable this warning by setting the environment - # variable 'HDF5_DISABLE_VERSION_CHECK' to a value of '1'. - # Setting it to 2 or higher will suppress the warning messages totally. 
- name: Set HDF5_DISABLE_VERSION_CHECK environment variable - command: setx /m HDF5_DISABLE_VERSION_CHECK 1 - - when: - condition: - and: - - not: - equal: [ "3.10", <> ] - - equal: [ "kedro-datasets", <> ] - steps: - - run: - name: Run unit tests without spark in parallel - command: conda activate kedro_plugins; make test-no-spark - - when: - condition: - and: - - equal: [ "3.10", <> ] - - equal: [ "kedro-datasets", <> ] - steps: - - run: - name: Run unit tests without spark sequentially - command: conda activate kedro_plugins; make test-no-spark-sequential - - sync: - parameters: - python_version: - type: string - docker: - # https://circleci.com/docs/2.0/circleci-images/#circleci-base-image - - image: cimg/base:2020.01 - steps: - - checkout - - add_ssh_keys - - run: - name: Set git email and name - command: | - git config --global user.email "kedro@kedro.com" - git config --global user.name "Kedro" - # - run: - # name: Trigger Read The Docs build - # command: ./tools/circleci/rtd-build.sh ${RTD_TOKEN} latest - - setup_conda: - python_version: <> - - run: - name: Maybe trigger the release workflow - command: | - conda activate kedro_plugins - pip install requests - ./tools/circleci/circleci_release.py - - - # This is effectively just a combination of the lint, unit_tests and e2e_tests jobs. - # It's used to check that the nightly docker image is working ok and before publishing a release. - build_package: - parameters: - python_version: - type: string - machine: - image: ubuntu-2004:202201-02 - docker_layer_caching: true - steps: - - setup: - python_version: <> # Just need one Python version here - plugin: <> - - run: - name: Run linters - command: export plugin=<>; make lint - - unless: - condition: - equal: ["3.10", <>] - steps: - - run: - name: Run unit tests in parallel - command: export plugin=<>; make test - - when: - condition: - equal: [ "3.10", <> ] - steps: - - run: - name: Run unit tests sequentially - command: export plugin=<>; make test-sequential - - run: - name: Run e2e tests - command: make plugin=<> e2e-tests - - publish_package: - machine: - image: ubuntu-2004:202201-02 - docker_layer_caching: true - steps: - - run: - name: Print the release package and version - command: | - echo "Release package: <> <>" - - setup: - python_version: "3.8" # Just need one Python version here - plugin: <> # From circle_release.py - - add_ssh_keys - - run: - name: Tag and publish release on Github - command: ./tools/circleci/github_release.py <> <> - - run: - name: Publish to PyPI - command: | - export plugin=<> - make package - make pypi - - -workflows: - # when pipeline parameter, run-build-kedro-telemetry is true, the - # kedro-telemetry job is triggered. - kedro-telemetry: - when: - and: - - <> - - not: <> - - not: <> - jobs: - - unit_tests: - plugin: "kedro-telemetry" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - win_unit_tests: - plugin: "kedro-telemetry" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - lint: - plugin: "kedro-telemetry" - # when pipeline parameter, run-build-kedro-docker is true, the - # kedro-docker job is triggered. 
- kedro-docker: - when: - and: - - <> - - not: <> - - not: <> - jobs: - - unit_tests: - plugin: "kedro-docker" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - e2e_tests: - plugin: "kedro-docker" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - win_unit_tests: - plugin: "kedro-docker" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - lint: - plugin: "kedro-docker" - # when pipeline parameter, run-build-kedro-airflow is true, the - # kedro-airflow job is triggered. - kedro-airflow: - when: - and: - - <> - - not: <> - - not: <> - jobs: - - unit_tests: - plugin: "kedro-airflow" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - pre-steps: - - run: - name: Avoid GPL dependency (unidecode) - command: echo 'export SLUGIFY_USES_TEXT_UNIDECODE=yes' >> $BASH_ENV - - e2e_tests: - plugin: "kedro-airflow" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - win_unit_tests: - plugin: "kedro-airflow" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - lint: - plugin: "kedro-airflow" - # when pipeline parameter, run-build-kedro-datasets is true, the - # kedro-datasets job is triggered. - kedro-datasets: - when: - and: - - <> - - not: <> - - not: <> - jobs: - - unit_tests: - plugin: "kedro-datasets" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - win_unit_tests: - plugin: "kedro-datasets" - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - lint: - plugin: "kedro-datasets" - - # For release - main_updated: - when: - and: - - not: <> - - not: <> - jobs: - - sync: - filters: - branches: - only: main - matrix: - # We just need one Python enviornment to trigger the job - parameters: - python_version: ["3.8"] - - package_release: - when: - and: - - <> - - <> - jobs: - - build_package: - matrix: - parameters: - python_version: ["3.7", "3.8", "3.9", "3.10"] - - publish_package: - requires: - - build_package diff --git a/.github/workflows/merge-gatekeeper.yml b/.github/workflows/merge-gatekeeper.yml index be615ecbd..77ad752de 100644 --- a/.github/workflows/merge-gatekeeper.yml +++ b/.github/workflows/merge-gatekeeper.yml @@ -22,6 +22,5 @@ jobs: uses: upsidr/merge-gatekeeper@v1 with: token: ${{ secrets.GITHUB_TOKEN }} - timeout: 1800 + timeout: 2400 interval: 30 - diff --git a/kedro-telemetry/.circleci/config.yml b/kedro-telemetry/.circleci/config.yml deleted file mode 100644 index d0915db48..000000000 --- a/kedro-telemetry/.circleci/config.yml +++ /dev/null @@ -1,131 +0,0 @@ -version: 2.1 - -orbs: - win: circleci/windows@2.4.0 - -commands: - # Windows-related commands - win_setup_conda: - # Miniconda3 is pre-installed on the machine: - # https://circleci.com/docs/2.0/hello-world-windows/ - description: Setup conda - steps: - - run: - name: Initialize conda - command: conda init powershell - - run: - name: Create 'kedro-telemetry' conda environment - command: | - conda create --name kedro-telemetry python=$env:CONDA_ENV_PY_VERSION -y - - win_setup_requirements: - description: Install kedro-telemetry dependencies - steps: - - run: - name: Install kedro-telemetry dependencies - command: | - conda activate kedro-telemetry - python -m pip install -U pip setuptools wheel - pip install git+https://github.com/kedro-org/kedro@main - pip install -r test_requirements.txt -U - - win_build: - description: Run build on Windows - steps: - - checkout - - win_setup_conda - - win_setup_requirements - - run: - name: Run unit tests - 
command: | - conda activate kedro-telemetry - pytest .\tests - -jobs: - build_36: &DEFAULT - machine: - # Don't use 2018 image: https://discuss.circleci.com/t/24639/18 - image: circleci/classic:201711-01 - docker_layer_caching: true - environment: - CONDA_ENV_PY_VERSION: "3.6" - steps: - - checkout - - run: - name: Create virtual env - command: | - # Get rid of pyenv stuff - sudo rm -rf .pyenv/ /opt/circleci/.pyenv/ - # Download and install miniconda - curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - # Create an anaconda virtualenv for ${CONDA_ENV_PY_VERSION} and make that the default python interpreter - echo ". /home/circleci/miniconda/etc/profile.d/conda.sh" >> $BASH_ENV - echo "conda deactivate; conda activate kedro-telemetry" >> $BASH_ENV - . /home/circleci/miniconda/etc/profile.d/conda.sh - conda create --name kedro-telemetry python=${CONDA_ENV_PY_VERSION} -y - source $BASH_ENV - - run: - name: Pip install dependencies - command: | - make install-pip-setuptools - pip install git+https://github.com/kedro-org/kedro - pip install -r test_requirements.txt - pre-commit install --install-hooks - pre-commit install --hook-type pre-push - - run: - name: Run pylint and flake8 - command: | - make lint - - run: - name: Run tests - command: | - make test - - build_37: - <<: *DEFAULT - environment: - CONDA_ENV_PY_VERSION: 3.7 - - build_38: - <<: *DEFAULT - environment: - CONDA_ENV_PY_VERSION: 3.8 - - # Windows-related jobs - win_build_36: - executor: - name: win/default - working_directory: ~/repo - environment: - CONDA_ENV_PY_VERSION: "3.6" - steps: [win_build] - - win_build_37: - executor: - name: win/default - working_directory: ~/repo - environment: - CONDA_ENV_PY_VERSION: "3.7" - steps: - - win_build - - win_build_38: - executor: - name: win/default - working_directory: ~/repo - environment: - CONDA_ENV_PY_VERSION: "3.8" - steps: - - win_build - -workflows: - version: 2 - regular: - jobs: - - build_36 - - build_37 - - build_38 - - win_build_36 - - win_build_37 - - win_build_38 diff --git a/kedro-telemetry/.github/ISSUE_TEMPLATE/bug-report.md b/kedro-telemetry/.github/ISSUE_TEMPLATE/bug-report.md deleted file mode 100644 index f89bf88bf..000000000 --- a/kedro-telemetry/.github/ISSUE_TEMPLATE/bug-report.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -name: Bug report -about: If something isn't working -title: '' -labels: 'Issue: Bug Report' -assignees: '' - ---- - -## Description -Short description of the problem here. - -## Context -How has this bug affected you? What were you trying to accomplish? - -## Steps to Reproduce -1. [First Step] -2. [Second Step] -3. [And so on...] - -## Expected Result -Tell us what should happen. - -## Actual Result -Tell us what happens instead. - -``` --- If you received an error, place it here. -``` - -``` --- Separate them if you have more than one. 
-``` - -## Your Environment -Include as many relevant details about the environment in which you experienced the bug: - -* Kedro-telemetry version used (`pip show kedro-telemetry`): -* Kedro version used (`pip show kedro` or `kedro -V`): -* Python version used (`python -V`): -* Operating system and version: diff --git a/kedro-telemetry/.github/ISSUE_TEMPLATE/feature-request.md b/kedro-telemetry/.github/ISSUE_TEMPLATE/feature-request.md deleted file mode 100644 index a7911c2f1..000000000 --- a/kedro-telemetry/.github/ISSUE_TEMPLATE/feature-request.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: Feature request -about: Let us know if you have a feature request or enhancement -title: '<Title>' -labels: 'Issue: Feature Request' -assignees: '' - ---- - -## Description -Is your feature request related to a problem? A clear and concise description of what the problem is: "I'm always frustrated when ..." - -## Context -Why is this change important to you? How would you use it? How can it benefit other users? - -## Possible Implementation -(Optional) Suggest an idea for implementing the addition or change. - -## Possible Alternatives -(Optional) Describe any alternative solutions or features you've considered. diff --git a/kedro-telemetry/.github/PULL_REQUEST_TEMPLATE.md b/kedro-telemetry/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 9efb5746d..000000000 --- a/kedro-telemetry/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,13 +0,0 @@ -## Description -<!-- Why was this PR created? --> - -## Development notes -<!-- What have you changed, and how has this been tested? --> - -## Checklist - -- [ ] Read the [contributing](https://github.com/kedro-org/kedro/blob/main/CONTRIBUTING.md) guidelines -- [ ] Opened this PR as a 'Draft Pull Request' if it is work-in-progress -- [ ] Updated the documentation to reflect the code changes -- [ ] Added a description of this change in the [`RELEASE.md`](https://github.com/kedro-org/kedro/blob/main/RELEASE.md) file -- [ ] Added tests to cover my changes diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py deleted file mode 100755 index dd05d4c5a..000000000 --- a/tools/circleci/circleci_release.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python3 -""" -CircleCI pipeline to check if it needs to trigger a release -""" - -import os -import sys - -import requests -from requests.structures import CaseInsensitiveDict - -from utils.check_no_version_pypi import check_no_version_pypi -from utils.package_version import get_package_version - -PACKAGE_PATHS = ( - "kedro-datasets/kedro_datasets", - "kedro-telemetry/kedro_telemetry", - "kedro-airflow/kedro_airflow", - "kedro-docker/kedro_docker", -) -PROJECT_SLUG = "github/kedro-org/kedro-plugins" -# CIRCLE_BRANCH = "feat/cicd-auto-release" -CIRCLE_BRANCH = os.environ.get("CIRCLE_BRANCH") - - -def circleci_release(project_slug, payload, circle_endpoint, circle_release_token): - """Trigging the CircleCI Release Pipeline""" - # See https://circleci.com/docs/2.0/api-developers-guide - print("Starting the CircleCI Release Pipeline") - CIRCLE_ENDPOINT = f"https://circleci.com/api/v2/project/{project_slug}/pipeline" - - headers = CaseInsensitiveDict() - headers["Content-Type"] = "application/json" - headers["Circle-Token"] = circle_release_token - - resp = requests.post(circle_endpoint, headers=headers, json=payload, timeout=10) - return resp - - -if __name__ == "__main__": - """Trigger the CircleCI Release Process""" - from pathlib import Path - - # Personal API Tokens - 
https://circleci.com/docs/managing-api-tokens - CIRCLE_RELEASE_TOKEN = os.environ.get("CIRCLE_RELEASE_TOKEN") - if not CIRCLE_RELEASE_TOKEN: - raise ValueError("CIRCLE_RELEASE_TOKEN is not defined as envionrmnet variable.") - - base_path = Path() - # Loop for all 4 repositories - for package_path in PACKAGE_PATHS: - package_name, _ = package_path.split("/") - package_version = get_package_version(base_path, package_path) - pypi_endpoint = f"https://pypi.org/pypi/{package_name}/{package_version}/json/" - circleci_endpoint = ( - f"https://circleci.com/api/v2/project/{PROJECT_SLUG}/pipeline" - ) - payload = { - "branch": CIRCLE_BRANCH, - "parameters": { - "release_package": package_name, - "release_version": package_version, - }, - } - - print(package_name, package_version) - if check_no_version_pypi(pypi_endpoint, package_name, package_version): - res = circleci_release( - PROJECT_SLUG, payload, circleci_endpoint, CIRCLE_RELEASE_TOKEN - ) - print(f"Status Code: {resp.status_code}") - if resp.status_code == 201: - print("Creating CircleCI Pipeline successfully") - else: - print("Failed to create CircleCI Pipeline") - print(resp.content) - if resp.status_code != 201: - sys.exit(1) diff --git a/tools/circleci/github_release.py b/tools/circleci/github_release.py deleted file mode 100755 index d5bc3115c..000000000 --- a/tools/circleci/github_release.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys - -import requests -from requests.structures import CaseInsensitiveDict - -GITHUB_USER = "kedro-org" -GITHUB_REPO = "kedro-plugins" -# On GitHub select "Settings" > "Developer Setting" -> "Personal access Token"" -GITHUB_TAGGING_TOKEN = os.environ.get("GITHUB_TAGGING_TOKEN") - - -def github_release( - package_name, - version, - github_user=GITHUB_USER, - github_repo=GITHUB_REPO, - github_tagging_token=GITHUB_TAGGING_TOKEN, -): - """Trigger the GitHub Release to create artifacts and tags""" - print("Starting GitHub Release") - - github_endpoint = ( - f"https://api.github.com/repos/{github_user}/{github_repo}/releases" - ) - payload = { - "tag_name": f"{package_name}-{version}", # kedro-datasets 0.0.1 - "target_commitish": "main", - "name": f"{version}", - "body": f"Release {version}", - "draft": False, - "prerelease": False, - } - - headers = CaseInsensitiveDict() - headers["Content-Type"] = "application/json" - headers["Authorization"] = f"token {github_tagging_token}" - resp = requests.post(github_endpoint, headers=headers, json=payload, timeout=10) - if resp.status_code == 200: - print("Create GitHub release successfully") - print(resp.content) - else: - print("Failed to create Github release") - print(resp.content) - return resp - - -if __name__ == "__main__": - package_name = sys.argv[1] - package_version = sys.argv[2] - res = github_release(package_name, package_version) diff --git a/tools/circleci/utils/check_no_version_pypi.py b/tools/circleci/utils/check_no_version_pypi.py deleted file mode 100644 index 777f09c9a..000000000 --- a/tools/circleci/utils/check_no_version_pypi.py +++ /dev/null @@ -1,13 +0,0 @@ -import requests - - -def check_no_version_pypi(pypi_endpoint, package_name, package_version): - print("Check if {package_name} {package_version} is on pypi") - response = requests.get(pypi_endpoint, timeout=10) - if response.status_code == 404: - # Not exist on Pypi - do release - print(f"Starting the release of {package_name} {package_version}") - return True - else: - print(f"Skipped: {package_name} {package_version} already exists on PyPI") - return False 
diff --git a/tools/circleci/utils/package_version.py b/tools/circleci/utils/package_version.py deleted file mode 100644 index 48de594ff..000000000 --- a/tools/circleci/utils/package_version.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python3 -""" -Get version of Kedro -""" - -import re -from pathlib import Path - -VERSION_MATCHSTR = r'\s*__version__\s*=\s*"(\d+\.\d+\.\d+)"' - - -def get_package_version(base_path, package_path): - init_file_path = Path(base_path) / package_path / "__init__.py" - match_obj = re.search(VERSION_MATCHSTR, Path(init_file_path).read_text()) - return match_obj.group(1) - - -if __name__ == "__main__": - base_path = Path() - package_path = "kedro-datasets/kedro_datasets" - print(get_package_version(base_path, package_path)) From 9d7820a79c0f0d3bee7d746570fad6cf86bfe466 Mon Sep 17 00:00:00 2001 From: McDonnellJoseph <90898184+McDonnellJoseph@users.noreply.github.com> Date: Mon, 22 May 2023 12:01:29 +0200 Subject: [PATCH 86/96] feat: Dataset API add `save` method (#180) * [FEAT] add save method to APIDataset Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] create save_args parameter for api_dataset Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] add tests for socket + http errors Signed-off-by: <jmcdonnell@fieldbox.ai> Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] check save data is json Signed-off-by: <jmcdonnell@fieldbox.ai> Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] clean code Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] handle different data types Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] test coverage for exceptions Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] add examples in APIDataSet docstring Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * sync APIDataSet from kedro's `develop` (#184) * Update APIDataSet Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Sync ParquetDataSet Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Sync Test Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Linting Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Revert Unnecessary ParquetDataSet Changes Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Sync release notes Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> --------- Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] remove support for delete method Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] lint files Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] fix conflicts Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] remove fail save test Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] review suggestions Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [ENH] fix tests Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> * [FIX] reorder arguments Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> --------- Signed-off-by: jmcdonnell <jmcdonnell@fieldbox.ai> Signed-off-by: <jmcdonnell@fieldbox.ai> Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> Co-authored-by: jmcdonnell <jmcdonnell@fieldbox.ai> Co-authored-by: Nok Lam Chan <mediumnok@gmail.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/RELEASE.md | 10 +- .../kedro_datasets/api/api_dataset.py | 138 ++++++++++++++---- .../kedro_datasets/pandas/generic_dataset.py | 2 - .../spark/spark_jdbc_dataset.py | 1 - kedro-datasets/tests/api/test_api_dataset.py | 138 ++++++++++++++++-- 5 files 
changed, 246 insertions(+), 43 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 2dbee5adc..fd2a755ca 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -3,6 +3,8 @@ ## Major features and improvements: * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). +* Added a save method to the APIDataSet + * Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. * Relaxed Kedro version pin to `>=0.16` @@ -42,10 +44,10 @@ Many thanks to the following Kedroids for contributing PRs to this release: * Added the following new datasets: -| Type | Description | Location | -| ------------------------------------ | -------------------------------------------------------------------------- | ----------------------------- | -| `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lighting fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | -| `snowflake.SnowparkTableDataSet` | Work with [Snowpark](https://www.snowflake.com/en/data-cloud/snowpark/) DataFrames from tables in Snowflake. | `kedro_datasets.snowflake` | +| Type | Description | Location | +| -------------------------------- | --------------------------------------------------------------------------------------------------------------------- | -------------------------- | +| `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lighting fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | +| `snowflake.SnowparkTableDataSet` | Work with [Snowpark](https://www.snowflake.com/en/data-cloud/snowpark/) DataFrames from tables in Snowflake. | `kedro_datasets.snowflake` | ## Bug fixes and other changes * Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index cb8f80d37..b1b93a7eb 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -1,7 +1,9 @@ """``APIDataSet`` loads the data from HTTP(S) APIs. It uses the python requests library: https://requests.readthedocs.io/en/latest/ """ -from typing import Any, Dict, List, NoReturn, Tuple, Union +import json as json_ # make pylint happy +from copy import deepcopy +from typing import Any, Dict, List, Tuple, Union import requests from kedro.io.core import AbstractDataSet, DataSetError @@ -14,11 +16,10 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): - """``APIDataSet`` loads the data from HTTP(S) APIs. + """``APIDataSet`` loads/saves data from/to HTTP(S) APIs. It uses the python requests library: https://requests.readthedocs.io/en/latest/ - Example usage for the - `YAML API <https://kedro.readthedocs.io/en/stable/data/\ + Example usage for the `YAML API <https://kedro.readthedocs.io/en/stable/data/\ data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. 
code-block:: yaml @@ -34,10 +35,8 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): agg_level_desc: STATE, year: 2000 - Example usage for the - `Python API <https://kedro.readthedocs.io/en/stable/data/\ - data_catalog.html#use-the-data-catalog-with-the-code-api>`_: - :: + Example usage for the `Python API <https://kedro.readthedocs.io/en/stable/data/\ + data_catalog.html#use-the-data-catalog-with-the-code-api>`_: :: >>> from kedro.extras.datasets.api import APIDataSet >>> @@ -57,49 +56,101 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): >>> credentials=("username", "password") >>> ) >>> data = data_set.load() + + ``APIDataSet`` can also be used to save output on a remote server using HTTP(S) + methods. + + >>> example_table = '{"col1":["val1", "val2"], "col2":["val3", "val4"]}' + + >>> data_set = APIDataSet( + method = "POST" + url = "url_of_remote_server", + save_args = {"chunk_size":1} + ) + >>> data_set.save(example_table) + + On initialisation, we can specify all the necessary parameters in the save args + dictionary. The default HTTP(S) method is POST but PUT is also supported. Two + important parameters to keep in mind are timeout and chunk_size. `timeout` defines + how long our program waits for a response after a request. `chunk_size`, is only + used if the input of save method is a list. It will divide the request into chunks + of size `chunk_size`. For example, here we will send two requests each containing + one row of our example DataFrame. + If the data passed to the save method is not a list, ``APIDataSet`` will check if it + can be loaded as JSON. If true, it will send the data unchanged in a single request. + Otherwise, the ``_save`` method will try to dump the data in JSON format and execute + the request. """ + DEFAULT_SAVE_ARGS = { + "params": None, + "headers": None, + "auth": None, + "json": None, + "timeout": 60, + "chunk_size": 100, + } + # pylint: disable=too-many-arguments + def __init__( self, url: str, method: str = "GET", load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, credentials: Union[Tuple[str, str], List[str], AuthBase] = None, ) -> None: """Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint. Args: url: The API URL endpoint. - method: The Method of the request, GET, POST, PUT, DELETE, HEAD, etc... + method: The method of the request. GET, POST, PUT are the only supported + methods load_args: Additional parameters to be fed to requests.request. https://requests.readthedocs.io/en/latest/api/#requests.request credentials: Allows specifying secrets in credentials.yml. - Expected format is ``('login', 'password')`` if given as a tuple or list. - An ``AuthBase`` instance can be provided for more complex cases. + Expected format is ``('login', 'password')`` if given as a tuple or + list. An ``AuthBase`` instance can be provided for more complex cases. + save_args: Options for saving data on server. Includes all parameters used + during load method. Adds an optional parameter, ``chunk_size`` which + determines the size of the package sent at each request. Raises: - ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are specified. + ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are + specified. 
""" super().__init__() - self._load_args = load_args or {} - self._load_args_auth = self._load_args.pop("auth", None) + # GET method means load + if method == "GET": + self._params = load_args or {} + + # PUT, POST, DELETE means save + elif method in ["PUT", "POST"]: + self._params = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._params.update(save_args) + self._chunk_size = self._params.pop("chunk_size", 1) + else: + raise ValueError("Only GET, POST and PUT methods are supported") + + self._param_auth = self._params.pop("auth", None) - if credentials is not None and self._load_args_auth is not None: + if credentials is not None and self._param_auth is not None: raise ValueError("Cannot specify both auth and credentials.") - self._auth = credentials or self._load_args_auth + self._auth = credentials or self._param_auth - if "cert" in self._load_args: - self._load_args["cert"] = self._convert_type(self._load_args["cert"]) + if "cert" in self._params: + self._params["cert"] = self._convert_type(self._params["cert"]) - if "timeout" in self._load_args: - self._load_args["timeout"] = self._convert_type(self._load_args["timeout"]) + if "timeout" in self._params: + self._params["timeout"] = self._convert_type(self._params["timeout"]) self._request_args: Dict[str, Any] = { "url": url, "method": method, "auth": self._convert_type(self._auth), - **self._load_args, + **self._params, } @staticmethod @@ -131,11 +182,48 @@ def _execute_request(self, session: Session) -> requests.Response: return response def _load(self) -> requests.Response: - with sessions.Session() as session: - return self._execute_request(session) + if self._request_args["method"] == "GET": + with sessions.Session() as session: + return self._execute_request(session) + + raise DataSetError("Only GET method is supported for load") + + def _execute_save_with_chunks( + self, + json_data: List[Dict[str, Any]], + ) -> requests.Response: + chunk_size = self._chunk_size + n_chunks = len(json_data) // chunk_size + 1 + + for i in range(n_chunks): + send_data = json_data[i * chunk_size : (i + 1) * chunk_size] + response = self._execute_save_request(json_data=send_data) + + return response + + def _execute_save_request(self, json_data: Any) -> requests.Response: + try: + json_.loads(json_data) + except TypeError: + self._request_args["json"] = json_.dumps(json_data) + try: + response = requests.request(**self._request_args) + response.raise_for_status() + except requests.exceptions.HTTPError as exc: + raise DataSetError("Failed to send data", exc) from exc + + except OSError as exc: + raise DataSetError("Failed to connect to the remote server") from exc + return response + + def _save(self, data: Any) -> requests.Response: + if self._request_args["method"] in ["PUT", "POST"]: + if isinstance(data, list): + return self._execute_save_with_chunks(json_data=data) + + return self._execute_save_request(json_data=data) - def _save(self, data: None) -> NoReturn: - raise DataSetError(f"{self.__class__.__name__} is a read only data set type") + raise DataSetError("Use PUT or POST methods for save") def _exists(self) -> bool: with sessions.Session() as session: diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index a2bb6b1be..91229edcf 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -181,7 +181,6 @@ def _ensure_file_system_target(self) -> None: ) def _load(self) -> 
pd.DataFrame: - self._ensure_file_system_target() load_path = get_filepath_str(self._get_load_path(), self._protocol) @@ -196,7 +195,6 @@ def _load(self) -> pd.DataFrame: ) def _save(self, data: pd.DataFrame) -> None: - self._ensure_file_system_target() save_path = get_filepath_str(self._get_save_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index ca3c7643c..c90c5f958 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -126,7 +126,6 @@ def __init__( # Update properties in load_args and save_args with credentials. if credentials is not None: - # Check credentials for bad inputs. for cred_key, cred_value in credentials.items(): if cred_value is None: diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index 848020041..c736f90b5 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -1,5 +1,6 @@ # pylint: disable=no-member import base64 +import json import socket import pytest @@ -10,25 +11,44 @@ from kedro_datasets.api import APIDataSet POSSIBLE_METHODS = ["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"] +SAVE_METHODS = ["POST", "PUT"] TEST_URL = "http://example.com/api/test" TEST_TEXT_RESPONSE_DATA = "This is a response." TEST_JSON_REQUEST_DATA = [{"key": "value"}] +TEST_JSON_RESPONSE_DATA = [{"key": "value"}] TEST_PARAMS = {"param": "value"} TEST_URL_WITH_PARAMS = TEST_URL + "?param=value" TEST_METHOD = "GET" TEST_HEADERS = {"key": "value"} +TEST_SAVE_DATA = [json.dumps({"key1": "info1", "key2": "info2"})] + class TestAPIDataSet: @pytest.mark.parametrize("method", POSSIBLE_METHODS) def test_request_method(self, requests_mock, method): - api_data_set = APIDataSet(url=TEST_URL, method=method) - requests_mock.register_uri(method, TEST_URL, text=TEST_TEXT_RESPONSE_DATA) - - response = api_data_set.load() - assert response.text == TEST_TEXT_RESPONSE_DATA + if method in ["OPTIONS", "HEAD", "PATCH", "DELETE"]: + with pytest.raises( + ValueError, + match="Only GET, POST and PUT methods are supported", + ): + APIDataSet(url=TEST_URL, method=method) + + else: + api_data_set = APIDataSet(url=TEST_URL, method=method) + + requests_mock.register_uri(method, TEST_URL, text=TEST_TEXT_RESPONSE_DATA) + + if method == "GET": + response = api_data_set.load() + assert response.text == TEST_TEXT_RESPONSE_DATA + else: + with pytest.raises( + DataSetError, match="Only GET method is supported for load" + ): + api_data_set.load() @pytest.mark.parametrize( "parameters_in, url_postfix", @@ -181,7 +201,6 @@ def test_certs(self, requests_mock, cert_in, cert_out): url=TEST_URL, method=TEST_METHOD, load_args={"cert": cert_in} ) requests_mock.register_uri(TEST_METHOD, TEST_URL) - response = api_data_set.load() assert response.request.cert == cert_out @@ -252,10 +271,107 @@ def test_socket_error(self, requests_mock): with pytest.raises(DataSetError, match="Failed to connect"): api_data_set.load() - def test_read_only_mode(self): + @pytest.mark.parametrize("method", POSSIBLE_METHODS) + def test_successful_save(self, requests_mock, method): + """ + When we want to save some data on a server + Given an APIDataSet class + Then check we get a response + """ + if method in ["PUT", "POST"]: + api_data_set = APIDataSet( + url=TEST_URL, + method=method, + save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + 
requests_mock.register_uri( + method, + TEST_URL_WITH_PARAMS, + headers=TEST_HEADERS, + status_code=requests.codes.ok, + ) + response = api_data_set._save(TEST_SAVE_DATA) + + assert isinstance(response, requests.Response) + elif method == "GET": + api_data_set = APIDataSet( + url=TEST_URL, + method=method, + save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + with pytest.raises(DataSetError, match="Use PUT or POST methods for save"): + api_data_set._save(TEST_SAVE_DATA) + else: + with pytest.raises( + ValueError, + match="Only GET, POST and PUT methods are supported", + ): + APIDataSet(url=TEST_URL, method=method) + + @pytest.mark.parametrize("save_methods", SAVE_METHODS) + def test_successful_save_with_json(self, requests_mock, save_methods): """ - Saving is disabled on the data set. + When we want to save with json parameters + Given an APIDataSet class + Then check we get a response """ - api_data_set = APIDataSet(url=TEST_URL, method=TEST_METHOD) - with pytest.raises(DataSetError, match="is a read only data set type"): - api_data_set.save({}) + api_data_set = APIDataSet( + url=TEST_URL, + method=save_methods, + save_args={"json": TEST_JSON_RESPONSE_DATA, "headers": TEST_HEADERS}, + ) + requests_mock.register_uri( + save_methods, + TEST_URL, + headers=TEST_HEADERS, + text=json.dumps(TEST_JSON_RESPONSE_DATA), + ) + response_list = api_data_set._save(TEST_SAVE_DATA) + + assert isinstance(response_list, requests.Response) + + response_dict = api_data_set._save({"item1": "key1"}) + assert isinstance(response_dict, requests.Response) + + response_json = api_data_set._save(TEST_SAVE_DATA[0]) + assert isinstance(response_json, requests.Response) + + @pytest.mark.parametrize("save_methods", SAVE_METHODS) + def test_save_http_error(self, requests_mock, save_methods): + api_data_set = APIDataSet( + url=TEST_URL, + method=save_methods, + save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS, "chunk_size": 2}, + ) + requests_mock.register_uri( + save_methods, + TEST_URL_WITH_PARAMS, + headers=TEST_HEADERS, + text="Nope, not found", + status_code=requests.codes.FORBIDDEN, + ) + + with pytest.raises(DataSetError, match="Failed to send data"): + api_data_set.save(TEST_SAVE_DATA) + + with pytest.raises(DataSetError, match="Failed to send data"): + api_data_set.save(TEST_SAVE_DATA[0]) + + @pytest.mark.parametrize("save_methods", SAVE_METHODS) + def test_save_socket_error(self, requests_mock, save_methods): + api_data_set = APIDataSet( + url=TEST_URL, + method=save_methods, + save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + requests_mock.register_uri(save_methods, TEST_URL_WITH_PARAMS, exc=socket.error) + + with pytest.raises( + DataSetError, match="Failed to connect to the remote server" + ): + api_data_set.save(TEST_SAVE_DATA) + + with pytest.raises( + DataSetError, match="Failed to connect to the remote server" + ): + api_data_set.save(TEST_SAVE_DATA[0]) From 36de4b9953f5ddd83153ff6f86a7decd901d6f1f Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Mon, 22 May 2023 12:31:24 +0100 Subject: [PATCH 87/96] ci: Automatically extract release notes for GitHub Releases (#212) * ci: Automatically extract release notes Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com> * fix lint Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com> * Raise exceptions Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com> * Lint Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com> * Lint Signed-off-by: Ankita Katiyar 
<ankitakatiyar2401@gmail.com> --------- Signed-off-by: Ankita Katiyar <ankitakatiyar2401@gmail.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- .github/workflows/check-release.yml | 29 +++++++------- tools/github_actions/extract_release_notes.py | 39 +++++++++++++++++++ 2 files changed, 52 insertions(+), 16 deletions(-) create mode 100644 tools/github_actions/extract_release_notes.py diff --git a/.github/workflows/check-release.yml b/.github/workflows/check-release.yml index 916cf70f7..51036d260 100644 --- a/.github/workflows/check-release.yml +++ b/.github/workflows/check-release.yml @@ -56,24 +56,21 @@ jobs: run: | export plugin=${{ needs.check-version.outputs.package_name }} make package + - name: Extract release notes from ${{needs.check-version.outputs.package_name}}/RELEASE.md + id: extract + run: | + python tools/github_actions/extract_release_notes.py \ + "${{needs.check-version.outputs.package_name}}/RELEASE.md" \ + "Release ${{needs.check-version.outputs.package_version}}" - name: Create GitHub Release - uses: actions/github-script@v6 + uses: softprops/action-gh-release@v1 with: - github-token: ${{ secrets.GH_TAGGING_TOKEN }} - script: | - const package_name = "${{ needs.check-version.outputs.package_name }}" - const package_version = "${{ needs.check-version.outputs.package_version }}" - const response = await github.rest.repos.createRelease({ - owner: context.repo.owner, - repo: context.repo.repo, - tag_name: `${package_name}-${package_version}`, - target_commitish: 'main', - name: `${package_name}-${package_version}`, - body: `Release ${package_version}`, - draft: false, - prerelease: false, - }); - return response.data; + tag_name: ${{needs.check-version.outputs.package_name}}-${{needs.check-version.outputs.package_version}} + name: ${{needs.check-version.outputs.package_name}}-${{needs.check-version.outputs.package_version}} + body_path: release_body.txt + draft: false + prerelease: false + token: ${{ secrets.GH_TAGGING_TOKEN }} - name: Set PyPI token run: | if [ "${{ needs.check-version.outputs.PACKAGE_NAME }}" == "kedro-airflow" ]; then diff --git a/tools/github_actions/extract_release_notes.py b/tools/github_actions/extract_release_notes.py new file mode 100644 index 000000000..52a8516cb --- /dev/null +++ b/tools/github_actions/extract_release_notes.py @@ -0,0 +1,39 @@ +import sys + + +def extract_section(filename, heading): + with open(filename, 'r') as file: + lines = file.readlines() + + start_line, end_line = None, None + + for i, line in enumerate(lines): + if line.startswith('# '): + current_heading = line.strip('#').replace(':', '').strip() + if current_heading == heading: + start_line = i + elif start_line is not None: + end_line = i + break + + if start_line is not None: + if end_line is None: + end_line = len(lines) + section_lines = lines[start_line + 1:end_line] + section = ''.join(section_lines).strip() + return section + else: + return None + + +if __name__ == '__main__': + if len(sys.argv) != 3: + raise Exception("Usage: python extract_release_notes.py <filename> <heading>") + + filename = sys.argv[1] + heading = sys.argv[2] + section = extract_section(filename, heading) + if not section: + raise Exception(f"Section not found under the {heading} heading") + with open("release_body.txt", "w") as text_file: + text_file.write(section) From 870e623f4530b6c9428efdefd781fbc258a965d3 Mon Sep 17 00:00:00 2001 From: Ahdra Merali <90615669+AhdraMeraliQB@users.noreply.github.com> Date: Mon, 22 May 2023 15:43:07 +0100 Subject: [PATCH 88/96] feat: Add 
metadata attribute to datasets (#189) * Add metadata attribute to all datasets Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/RELEASE.md | 2 +- .../kedro_datasets/api/api_dataset.py | 18 +++++++------ .../biosequence/biosequence_dataset.py | 9 ++++++- .../kedro_datasets/dask/parquet_dataset.py | 5 ++++ .../kedro_datasets/email/message_dataset.py | 5 ++++ .../geopandas/geojson_dataset.py | 5 ++++ .../holoviews/holoviews_writer.py | 5 ++++ .../kedro_datasets/json/json_dataset.py | 5 ++++ .../matplotlib/matplotlib_writer.py | 5 ++++ .../kedro_datasets/networkx/gml_dataset.py | 5 ++++ .../networkx/graphml_dataset.py | 5 ++++ .../kedro_datasets/networkx/json_dataset.py | 5 ++++ .../kedro_datasets/pandas/csv_dataset.py | 5 ++++ .../kedro_datasets/pandas/excel_dataset.py | 5 ++++ .../kedro_datasets/pandas/feather_dataset.py | 5 ++++ .../kedro_datasets/pandas/gbq_dataset.py | 18 +++++++++++-- .../kedro_datasets/pandas/generic_dataset.py | 9 ++++++- .../kedro_datasets/pandas/hdf_dataset.py | 9 ++++++- .../kedro_datasets/pandas/json_dataset.py | 5 ++++ .../kedro_datasets/pandas/parquet_dataset.py | 5 ++++ .../kedro_datasets/pandas/sql_dataset.py | 11 ++++++++ .../kedro_datasets/pandas/xml_dataset.py | 5 ++++ .../kedro_datasets/pickle/pickle_dataset.py | 9 ++++++- .../kedro_datasets/pillow/image_dataset.py | 5 ++++ .../kedro_datasets/plotly/json_dataset.py | 5 ++++ .../kedro_datasets/plotly/plotly_dataset.py | 5 ++++ .../kedro_datasets/polars/csv_dataset.py | 5 ++++ .../kedro_datasets/redis/redis_dataset.py | 9 ++++++- .../snowflake/snowpark_dataset.py | 9 ++++++- .../spark/deltatable_dataset.py | 7 +++-- .../kedro_datasets/spark/spark_dataset.py | 6 ++++- .../spark/spark_hive_dataset.py | 5 ++++ .../spark/spark_jdbc_dataset.py | 5 ++++ .../svmlight/svmlight_dataset.py | 26 +++++++++++++++++++ .../tensorflow/tensorflow_model_dataset.py | 6 +++++ .../kedro_datasets/text/text_dataset.py | 6 +++++ .../kedro_datasets/video/video_dataset.py | 11 +++++++- .../kedro_datasets/yaml/yaml_dataset.py | 5 ++++ 38 files changed, 254 insertions(+), 21 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index fd2a755ca..76d730159 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -4,9 +4,9 @@ * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). * Added a save method to the APIDataSet - * Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. * Relaxed Kedro version pin to `>=0.16` +* Added `metadata` attribute to all existing datasets. This is ignored by Kedro, but may be consumed by users or external plugins. ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index b1b93a7eb..ad2a6c367 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -10,10 +10,6 @@ from requests import Session, sessions from requests.auth import AuthBase -# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. 
-# Any contribution to datasets should be made in kedro-datasets -# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) - class APIDataSet(AbstractDataSet[None, requests.Response]): """``APIDataSet`` loads/saves data from/to HTTP(S) APIs. @@ -38,7 +34,7 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): Example usage for the `Python API <https://kedro.readthedocs.io/en/stable/data/\ data_catalog.html#use-the-data-catalog-with-the-code-api>`_: :: - >>> from kedro.extras.datasets.api import APIDataSet + >>> from kedro_datasets.api import APIDataSet >>> >>> >>> data_set = APIDataSet( @@ -99,6 +95,7 @@ def __init__( load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, credentials: Union[Tuple[str, str], List[str], AuthBase] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint. @@ -108,12 +105,15 @@ def __init__( methods load_args: Additional parameters to be fed to requests.request. https://requests.readthedocs.io/en/latest/api/#requests.request - credentials: Allows specifying secrets in credentials.yml. - Expected format is ``('login', 'password')`` if given as a tuple or - list. An ``AuthBase`` instance can be provided for more complex cases. save_args: Options for saving data on server. Includes all parameters used during load method. Adds an optional parameter, ``chunk_size`` which determines the size of the package sent at each request. + credentials: Allows specifying secrets in credentials.yml. + Expected format is ``('login', 'password')`` if given as a tuple or list. + An ``AuthBase`` instance can be provided for more complex cases. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. + Raises: ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are specified. @@ -153,6 +153,8 @@ def __init__( **self._params, } + self.metadata = metadata + @staticmethod def _convert_type(value: Any): """ diff --git a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py index e9dd924a6..ed683da48 100644 --- a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py +++ b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py @@ -10,7 +10,9 @@ from kedro.io.core import AbstractDataSet, get_filepath_str, get_protocol_and_path -class BioSequenceDataSet(AbstractDataSet[List, List]): +class BioSequenceDataSet( + AbstractDataSet[List, List] +): # pylint:disable=too-many-instance-attributes r"""``BioSequenceDataSet`` loads and saves data to a sequence file. Example: @@ -47,6 +49,7 @@ def __init__( save_args: Dict[str, Any] = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """ Creates a new instance of ``BioSequenceDataSet`` pointing @@ -69,6 +72,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
Note: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO """ @@ -100,6 +105,8 @@ def __init__( self._fs_open_args_load = _fs_open_args_load self._fs_open_args_save = _fs_open_args_save + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: return { "filepath": self._filepath, diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index f3c00e265..76344b7f5 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -94,6 +94,7 @@ def __init__( save_args: Dict[str, Any] = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``ParquetDataSet`` pointing to concrete parquet files. @@ -109,11 +110,15 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Optional parameters to the backend file system driver: https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html#optional-parameters + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ self._filepath = filepath self._fs_args = deepcopy(fs_args) or {} self._credentials = deepcopy(credentials) or {} + self.metadata = metadata + # Handle default load and save arguments self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) if load_args is not None: diff --git a/kedro-datasets/kedro_datasets/email/message_dataset.py b/kedro-datasets/kedro_datasets/email/message_dataset.py index 2faf3bb5d..e94735aac 100644 --- a/kedro-datasets/kedro_datasets/email/message_dataset.py +++ b/kedro-datasets/kedro_datasets/email/message_dataset.py @@ -64,6 +64,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``EmailMessageDataSet`` pointing to a concrete text file on a specific filesystem. @@ -103,6 +104,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -116,6 +119,8 @@ def __init__( _fs_args.setdefault("auto_mkdir", True) self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py index 75a9f8357..4596b2b82 100644 --- a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py +++ b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py @@ -56,6 +56,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``GeoJSONDataSet`` pointing to a concrete GeoJSON file on a specific filesystem fsspec. 
@@ -85,6 +86,8 @@ def __init__( Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `wb` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = copy.deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -97,6 +100,8 @@ def __init__( self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py index 9a17dbe7b..df38739e9 100644 --- a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py +++ b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py @@ -47,6 +47,7 @@ def __init__( credentials: Dict[str, Any] = None, save_args: Dict[str, Any] = None, version: Version = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``HoloviewsWriter``. @@ -70,6 +71,8 @@ def __init__( ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` attribute is None, save version will be autogenerated. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _credentials = deepcopy(credentials) or {} _fs_args = deepcopy(fs_args) or {} @@ -83,6 +86,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index 53239ece3..c2df700b3 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -59,6 +59,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON file on a specific filesystem. @@ -86,6 +87,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -99,6 +102,8 @@ def __init__( _fs_args.setdefault("auto_mkdir", True) self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py index d7aaf6a02..a0c9a049b 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py +++ b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py @@ -115,6 +115,7 @@ def __init__( save_args: Dict[str, Any] = None, version: Version = None, overwrite: bool = False, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``MatplotlibWriter``. 
@@ -140,6 +141,8 @@ def __init__( overwrite: If True, any existing image files will be removed. Only relevant when saving multiple Matplotlib objects at once. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _credentials = deepcopy(credentials) or {} _fs_args = deepcopy(fs_args) or {} @@ -153,6 +156,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py index 4dd88cb22..25111e639 100644 --- a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py @@ -48,6 +48,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``GMLDataSet``. @@ -73,6 +74,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -86,6 +89,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py index ca12b6bae..c538498a6 100644 --- a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py @@ -47,6 +47,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``GraphMLDataSet``. @@ -72,6 +73,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -85,6 +88,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/networkx/json_dataset.py b/kedro-datasets/kedro_datasets/networkx/json_dataset.py index 3fdf9f253..8ac0e35a3 100644 --- a/kedro-datasets/kedro_datasets/networkx/json_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/json_dataset.py @@ -48,6 +48,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``JSONDataSet``. 
@@ -73,6 +74,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving.
+ metadata: Any arbitrary metadata.
+ This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
@@ -86,6 +89,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)
+ self.metadata = metadata
+
super().__init__(
filepath=PurePosixPath(path), version=version,
diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
index 336aff406..52ba0c7e6 100644
--- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
+++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py
@@ -77,6 +77,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
+ metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``CSVDataSet`` pointing to a concrete CSV file on a specific filesystem.
@@ -102,6 +103,8 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``).
+ metadata: Any arbitrary metadata.
+ This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_credentials = deepcopy(credentials) or {}
@@ -114,6 +117,8 @@ def __init__(
self._storage_options = {**_credentials, **_fs_args}
self._fs = fsspec.filesystem(self._protocol, **self._storage_options)
+ self.metadata = metadata
+
super().__init__(
filepath=PurePosixPath(path), version=version,
diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
index 45aee3192..0eceb759d 100644
--- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
+++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py
@@ -118,6 +118,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
+ metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``ExcelDataSet`` pointing to a concrete Excel file on a specific filesystem.
@@ -150,6 +151,8 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``).
+ metadata: Any arbitrary metadata.
+ This is ignored by Kedro, but may be consumed by users or external plugins.
Raises:
DataSetError: If versioning is enabled while in append mode.
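The pattern above repeats across every dataset touched by this patch: the constructor gains an optional ``metadata`` argument, which is stored on the instance and never inspected by Kedro itself. A minimal sketch of the resulting behaviour, assuming the patch is applied; the dataset class, file path and metadata keys are arbitrary examples, not taken from this patch:

.. code-block:: python

    from kedro_datasets.pandas import CSVDataSet

    # `metadata` is passed through untouched and stored on the dataset;
    # Kedro itself never reads it.
    cars = CSVDataSet(
        filepath="data/01_raw/cars.csv",
        metadata={"owner": "data-engineering", "layer": "raw"},
    )
    assert cars.metadata == {"owner": "data-engineering", "layer": "raw"}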
@@ -165,6 +168,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index be261d42a..0ca8e1cd8 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -77,6 +77,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``FeatherDataSet`` pointing to a concrete filepath. @@ -102,6 +103,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _credentials = deepcopy(credentials) or {} @@ -114,6 +117,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index ebfadf249..a8001d2ae 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -20,7 +20,9 @@ ) -class GBQTableDataSet(AbstractDataSet[None, pd.DataFrame]): +class GBQTableDataSet( + AbstractDataSet[None, pd.DataFrame] +): # pylint:disable=too-many-instance-attributes """``GBQTableDataSet`` loads and saves data from/to Google BigQuery. It uses pandas-gbq to read and write from/to BigQuery table. @@ -74,6 +76,7 @@ def __init__( credentials: Union[Dict[str, Any], Credentials] = None, load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``GBQTableDataSet``. @@ -96,6 +99,8 @@ def __init__( Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html All defaults are preserved, but "progress_bar", which is set to False. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: When ``load_args['location']`` and ``save_args['location']`` @@ -125,6 +130,8 @@ def __init__( location=self._save_args.get("location"), ) + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: return { "dataset": self._dataset, @@ -171,7 +178,9 @@ def _validate_location(self): ) -class GBQQueryDataSet(AbstractDataSet[None, pd.DataFrame]): +class GBQQueryDataSet( + AbstractDataSet[None, pd.DataFrame] +): # pylint:disable=too-many-instance-attributes """``GBQQueryDataSet`` loads data from a provided SQL query from Google BigQuery. It uses ``pandas.read_gbq`` which itself uses ``pandas-gbq`` internally to read from BigQuery table. 
Therefore it supports all allowed @@ -214,6 +223,7 @@ def __init__( load_args: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, filepath: str = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``GBQQueryDataSet``. @@ -235,6 +245,8 @@ def __init__( (e.g. `{"project": "my-project"}` for ``GCSFileSystem``) used for reading the SQL query from filepath. filepath: A path to a file with a sql query statement. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: When ``sql`` and ``filepath`` parameters are either both empty @@ -283,6 +295,8 @@ def __init__( self._fs = fsspec.filesystem(self._protocol, **_fs_credentials, **_fs_args) self._filepath = path + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: load_args = copy.deepcopy(self._load_args) desc = {} diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index 91229edcf..9388876d7 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -28,7 +28,9 @@ ] -class GenericDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): +class GenericDataSet( + AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame] +): # pylint:disable=too-many-instance-attributes """`pandas.GenericDataSet` loads/saves data from/to a data file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to dynamically select the appropriate type of read/write target on a best effort basis. @@ -94,6 +96,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ): """Creates a new instance of ``GenericDataSet`` pointing to a concrete data file on a specific filesystem. The appropriate pandas load/save methods are @@ -134,6 +137,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: Will be raised if at least less than one appropriate @@ -154,6 +159,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index b821f17da..0632ad612 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -19,7 +19,9 @@ HDFSTORE_DRIVER = "H5FD_CORE" -class HDFDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): +class HDFDataSet( + AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame] +): # pylint:disable=too-many-instance-attributes """``HDFDataSet`` loads/saves data from/to a hdf file using an underlying filesystem (e.g. local, S3, GCS). It uses pandas.HDFStore to handle the hdf file. @@ -69,6 +71,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``HDFDataSet`` pointing to a concrete hdf file on a specific filesystem. 
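Since Kedro ignores the attribute, consumption is left entirely to plugins and user code. A rough sketch of what that could look like; the hook class, its logging behaviour and the use of the private ``_get_dataset`` call are illustrative assumptions rather than part of this patch, while ``after_catalog_created`` is an existing Kedro hook spec:

.. code-block:: python

    from kedro.framework.hooks import hook_impl

    class LogDatasetMetadataHooks:
        """Hypothetical hook that surfaces the new ``metadata`` attribute."""

        @hook_impl
        def after_catalog_created(self, catalog):
            for name in catalog.list():
                # `_get_dataset` is a private API, used here only for illustration.
                dataset = catalog._get_dataset(name)
                meta = getattr(dataset, "metadata", None)
                if meta:
                    print(f"{name}: {meta}")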
@@ -100,6 +103,8 @@ def __init__( Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set `wb` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -113,6 +118,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index 76d1cca0a..f0777ec21 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -72,6 +72,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON file on a specific filesystem. @@ -97,6 +98,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{'token': None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _credentials = deepcopy(credentials) or {} @@ -108,6 +111,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index b41d468c3..537abe9b0 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -83,6 +83,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``ParquetDataSet`` pointing to a concrete Parquet file on a specific filesystem. @@ -111,6 +112,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" _fs_args = deepcopy(fs_args) or {} _credentials = deepcopy(credentials) or {} @@ -123,6 +126,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index 029dc6939..a94a36743 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -156,12 +156,14 @@ class SQLTableDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]): # sqlalchemy.engine.Engine or sqlalchemy.engine.base.Engine engines: Dict[str, Any] = {} + # pylint: disable=too-many-arguments def __init__( self, table_name: str, credentials: Dict[str, Any], load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new ``SQLTableDataSet``. @@ -188,6 +190,8 @@ def __init__( To find all supported connection string formats, see here: https://docs.sqlalchemy.org/core/engines.html#database-urls It has ``index=False`` in the default parameters. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: When either ``table_name`` or ``con`` is empty. @@ -216,6 +220,8 @@ def __init__( self._connection_str = credentials["con"] self.create_connection(self._connection_str) + self.metadata = metadata + @classmethod def create_connection(cls, connection_str: str) -> None: """Given a connection string, create singleton connection @@ -380,6 +386,7 @@ def __init__( # pylint: disable=too-many-arguments fs_args: Dict[str, Any] = None, filepath: str = None, execution_options: Optional[Dict[str, Any]] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new ``SQLQueryDataSet``. @@ -411,6 +418,8 @@ def __init__( # pylint: disable=too-many-arguments https://docs.sqlalchemy.org/core/connections.html#sqlalchemy.engine.Connection.execution_options Note that this is not a standard argument supported by pandas API, but could be useful for handling large datasets. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: When either ``sql`` or ``con`` parameters is empty. @@ -441,6 +450,8 @@ def __init__( # pylint: disable=too-many-arguments else default_load_args ) + self.metadata = metadata + # load sql query from file if sql: self._load_args["sql"] = sql diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index 59f96e441..5a73a1536 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -55,6 +55,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``XMLDataSet`` pointing to a concrete XML file on a specific filesystem. @@ -80,6 +81,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" _fs_args = deepcopy(fs_args) or {} _credentials = deepcopy(credentials) or {} @@ -92,6 +95,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 11ee512c1..f381e39d4 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -18,7 +18,9 @@ ) -class PickleDataSet(AbstractVersionedDataSet[Any, Any]): +class PickleDataSet( + AbstractVersionedDataSet[Any, Any] +): # pylint:disable=too-many-instance-attributes """``PickleDataSet`` loads/saves data from/to a Pickle file using an underlying filesystem (e.g.: local, S3, GCS). The underlying functionality is supported by the specified backend library passed in (defaults to the ``pickle`` library), so it @@ -81,6 +83,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``PickleDataSet`` pointing to a concrete Pickle file on a specific filesystem. ``PickleDataSet`` supports custom backends to @@ -132,6 +135,8 @@ def __init__( Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `wb` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: ValueError: If ``backend`` does not satisfy the `pickle` interface. @@ -170,6 +175,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index 6dd94635e..aaf74fb1b 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -43,6 +43,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``ImageDataSet`` pointing to a concrete image file on a specific filesystem. @@ -70,6 +71,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -83,6 +86,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index f99fe8ac4..5a29a06e0 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -61,6 +61,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON file on a specific filesystem. @@ -92,6 +93,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -105,6 +108,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py index 1bb0acef6..38638a3d8 100644 --- a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py @@ -75,6 +75,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``PlotlyDataSet`` pointing to a concrete JSON file on a specific filesystem. @@ -109,6 +110,8 @@ def __init__( Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ super().__init__(filepath, load_args, save_args, version, credentials, fs_args) self._plotly_args = plotly_args @@ -121,6 +124,8 @@ def __init__( self._fs_open_args_load = _fs_open_args_load self._fs_open_args_save = _fs_open_args_save + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: return {**super()._describe(), "plotly_args": self._plotly_args} diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index 6bbc721c4..fa2332bfa 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -75,6 +75,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``CSVDataSet`` pointing to a concrete CSV file on a specific filesystem. @@ -103,6 +104,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. 
`{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = deepcopy(fs_args) or {} _credentials = deepcopy(credentials) or {} @@ -115,6 +118,8 @@ def __init__( self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index ce5aa741f..f292ca986 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -11,7 +11,9 @@ from kedro.io.core import AbstractDataSet, DataSetError -class PickleDataSet(AbstractDataSet[Any, Any]): +class PickleDataSet( + AbstractDataSet[Any, Any] +): # pylint:disable=too-many-instance-attributes """``PickleDataSet`` loads/saves data from/to a Redis database. The underlying functionality is supported by the redis library, so it supports all allowed options for instantiating the redis app ``from_url`` and setting @@ -68,6 +70,7 @@ def __init__( save_args: Dict[str, Any] = None, credentials: Dict[str, Any] = None, redis_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``PickleDataSet``. This loads/saves data from/to a Redis database while deserialising/serialising. Supports custom backends to @@ -109,6 +112,8 @@ def __init__( https://redis-py.readthedocs.io/en/stable/connections.html?highlight=from_url#redis.Redis.from_url All defaults are preserved, except `url`, which is set to `redis://127.0.0.1:6379`. You could also specify the url through the env variable ``REDIS_URL``. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: ValueError: If ``backend`` does not satisfy the `pickle` interface. @@ -134,6 +139,8 @@ def __init__( self._key = key + self.metadata = metadata + _redis_args = deepcopy(redis_args) or {} self._redis_from_url_args = _redis_args.pop("from_url_args", {}) self._redis_from_url_args.setdefault("url", self.DEFAULT_REDIS_URL) diff --git a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py index e1adc50c0..9cebbf12f 100644 --- a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py +++ b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py @@ -10,7 +10,9 @@ logger = logging.getLogger(__name__) -class SnowparkTableDataSet(AbstractDataSet): +class SnowparkTableDataSet( + AbstractDataSet +): # pylint:disable=too-many-instance-attributes """``SnowparkTableDataSet`` loads and saves Snowpark dataframes. As of Mar-2023, the snowpark connector only works with Python 3.8. @@ -108,6 +110,7 @@ def __init__( # pylint: disable=too-many-arguments load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, credentials: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``SnowparkTableDataSet``. @@ -128,6 +131,8 @@ def __init__( # pylint: disable=too-many-arguments credentials: A dictionary with a snowpark connection string. To find all supported arguments, see here: https://docs.snowflake.com/en/user-guide/python-connector-api.html#connect + metadata: Any arbitrary metadata. 
+ This is ignored by Kedro, but may be consumed by users or external plugins. """ if not table_name: @@ -168,6 +173,8 @@ def __init__( # pylint: disable=too-many-arguments self._connection_parameters = connection_parameters self._session = self._get_session(self._connection_parameters) + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: return { "table_name": self._table_name, diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index 34ee6f6a5..4290a2cfb 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -2,7 +2,7 @@ ``delta-spark`` """ from pathlib import PurePosixPath -from typing import NoReturn +from typing import Any, Dict, NoReturn from delta.tables import DeltaTable from kedro.io.core import AbstractDataSet, DataSetError @@ -62,7 +62,7 @@ class DeltaTableDataSet(AbstractDataSet[None, DeltaTable]): # using ``ThreadRunner`` instead _SINGLE_PROCESS = True - def __init__(self, filepath: str) -> None: + def __init__(self, filepath: str, metadata: Dict[str, Any] = None) -> None: """Creates a new instance of ``DeltaTableDataSet``. Args: @@ -70,11 +70,14 @@ def __init__(self, filepath: str) -> None: and working with data written to mount path points, specify ``filepath``s for (versioned) ``SparkDataSet``s starting with ``/dbfs/mnt``. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ fs_prefix, filepath = _split_filepath(filepath) self._fs_prefix = fs_prefix self._filepath = PurePosixPath(filepath) + self.metadata = metadata @staticmethod def _get_spark(): diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index a0d099350..f2da7573e 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -236,7 +236,7 @@ class SparkDataSet(AbstractVersionedDataSet[DataFrame, DataFrame]): DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS: Dict[str, Any] = {} - def __init__( # pylint: disable=too-many-arguments + def __init__( # pylint: disable=too-many-arguments disable=too-many-locals self, filepath: str, file_format: str = "parquet", @@ -244,6 +244,7 @@ def __init__( # pylint: disable=too-many-arguments save_args: Dict[str, Any] = None, version: Version = None, credentials: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``SparkDataSet``. @@ -275,12 +276,15 @@ def __init__( # pylint: disable=too-many-arguments ``key``, ``secret``, if ``filepath`` prefix is ``s3a://`` or ``s3n://``. Optional keyword arguments passed to ``hdfs.client.InsecureClient`` if ``filepath`` prefix is ``hdfs://``. Ignored otherwise. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" credentials = deepcopy(credentials) or {} fs_prefix, filepath = _split_filepath(filepath) path = PurePosixPath(filepath) exists_function = None glob_function = None + self.metadata = metadata if not filepath.startswith("/dbfs/") and _deployed_on_databricks(): logger.warning( diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index 3ea2fb0a1..75ae4cdf8 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -73,6 +73,7 @@ def __init__( write_mode: str = "errorifexists", table_pk: List[str] = None, save_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``SparkHiveDataSet``. @@ -88,6 +89,8 @@ def __init__( on a list of column names. Other `HiveOptions` can be found here: https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#specifying-storage-format-for-hive-tables + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Note: For users leveraging the `upsert` functionality, @@ -119,6 +122,8 @@ def __init__( self._format = self._save_args.pop("format", None) or "hive" self._eager_checkpoint = self._save_args.pop("eager_checkpoint", None) or True + self.metadata = metadata + def _describe(self) -> Dict[str, Any]: return { "database": self._database, diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index c90c5f958..2ac96e544 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -74,6 +74,7 @@ def __init__( credentials: Dict[str, Any] = None, load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new ``SparkJDBCDataSet``. @@ -93,6 +94,8 @@ def __init__( with the JDBC URL and the name of the table. To find all supported arguments, see here: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.jdbc.html + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: When either ``url`` or ``table`` is empty or @@ -116,6 +119,8 @@ def __init__( self._url = url self._table = table + self.metadata = metadata + # Handle default load and save arguments self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) if load_args is not None: diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index c08555aa1..cc26dd141 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -98,7 +98,31 @@ def __init__( version: Optional[Version] = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: + """Creates a new instance of SVMLightDataSet to load/save data from a svmlight/libsvm file. + + Args: + filepath: Filepath in POSIX format to a text file prefixed with a protocol like `s3://`. + If prefix is not provided, `file` protocol (local filesystem) will be used. + The prefix should be any protocol supported by ``fsspec``. + load_args: Arguments passed on to ``load_svmlight_file``. 
+ See the details in + https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_svmlight_file.html + save_args: Arguments passed on to ``dump_svmlight_file``. + See the details in + https://scikit-learn.org/stable/modules/generated/sklearn.datasets.dump_svmlight_file.html + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. + """ _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) _fs_open_args_save = _fs_args.pop("open_args_save", {}) @@ -111,6 +135,8 @@ def __init__( _fs_args.setdefault("auto_mkdir", True) self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 42b550737..a4ce65887 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -72,6 +72,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``TensorFlowModelDataSet``. @@ -96,6 +97,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{'token': None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ _fs_args = copy.deepcopy(fs_args) or {} _credentials = copy.deepcopy(credentials) or {} @@ -105,6 +108,9 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/text/text_dataset.py b/kedro-datasets/kedro_datasets/text/text_dataset.py index 0bb559e29..40697bc13 100644 --- a/kedro-datasets/kedro_datasets/text/text_dataset.py +++ b/kedro-datasets/kedro_datasets/text/text_dataset.py @@ -45,12 +45,14 @@ class TextDataSet(AbstractVersionedDataSet[str, str]): """ + # pylint: disable=too-many-arguments def __init__( self, filepath: str, version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``TextDataSet`` pointing to a concrete text file on a specific filesystem. @@ -74,6 +76,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -87,6 +91,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, diff --git a/kedro-datasets/kedro_datasets/video/video_dataset.py b/kedro-datasets/kedro_datasets/video/video_dataset.py index 03311146d..37239037f 100644 --- a/kedro-datasets/kedro_datasets/video/video_dataset.py +++ b/kedro-datasets/kedro_datasets/video/video_dataset.py @@ -126,7 +126,10 @@ class SequenceVideo(AbstractVideo): """A video object read from an indexable sequence of frames""" def __init__( - self, frames: Sequence[PIL.Image.Image], fps: float, fourcc: str = "mp4v" + self, + frames: Sequence[PIL.Image.Image], + fps: float, + fourcc: str = "mp4v", ) -> None: self._n_frames = len(frames) self._frames = frames @@ -155,6 +158,7 @@ def __getitem__(self, index: Union[int, slice]): class GeneratorVideo(AbstractVideo): """A video object with frames yielded by a generator""" + # pylint: disable=too-many-arguments def __init__( self, frames: Generator[PIL.Image.Image, None, None], @@ -258,12 +262,14 @@ class VideoDataSet(AbstractDataSet[AbstractVideo, AbstractVideo]): """ + # pylint: disable=too-many-arguments def __init__( self, filepath: str, fourcc: Optional[str] = "mp4v", credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of VideoDataSet to load / save video data for given filepath. @@ -276,6 +282,8 @@ def __init__( E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. """ # parse the path and protocol (e.g. file, http, s3, etc.) protocol, path = get_protocol_and_path(filepath) @@ -286,6 +294,7 @@ def __init__( _credentials = deepcopy(credentials) or {} self._storage_options = {**_credentials, **_fs_args} self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + self.metadata = metadata def _load(self) -> AbstractVideo: """Loads data from the video file. diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index a576f439a..5ab0fd3dc 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -56,6 +56,7 @@ def __init__( version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, + metadata: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``YAMLDataSet`` pointing to a concrete YAML file on a specific filesystem. @@ -83,6 +84,8 @@ def __init__( https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open All defaults are preserved, except `mode`, which is set to `r` when loading and to `w` when saving. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. 
""" _fs_args = deepcopy(fs_args) or {} _fs_open_args_load = _fs_args.pop("open_args_load", {}) @@ -96,6 +99,8 @@ def __init__( self._protocol = protocol self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self.metadata = metadata + super().__init__( filepath=PurePosixPath(path), version=version, From 9d66cc8b5bc6b3943707763c7477a3fbafe40976 Mon Sep 17 00:00:00 2001 From: Jannic <37243923+jmholzer@users.noreply.github.com> Date: Mon, 22 May 2023 16:47:35 +0100 Subject: [PATCH 89/96] feat: Add ManagedTableDataset for managed Delta Lake tables in Databricks (#206) * committing first version of UnityTableCatalog with unit tests. This datasets allows users to interface with Unity catalog tables in Databricks to both read and write. Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * renaming dataset Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * adding mlflow connectors Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * fixing mlflow imports Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * cleaned up mlflow for initial release Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * cleaned up mlflow references from setup.py for initial release Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * fixed deps in setup.py Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * adding comments before intiial PR Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * moved validation to dataclass Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * bug fix in type of partition column and cleanup Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * updated docstring for ManagedTableDataSet Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * added backticks to catalog Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * fixing regex to allow hyphens Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic 
<37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/test_requirements.txt Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * adding backticks to catalog Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Require pandas < 2.0 for compatibility with spark < 3.4 Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Replace use of walrus operator Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Add test coverage for validation methods Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Remove unused versioning functions Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Fix exception catching for invalid schema, add test for invalid schema Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Add pylint ignore Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Add tests/databricks to ignore for no-spark tests Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Nok Lam Chan <mediumnok@gmail.com> * Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Nok Lam Chan <mediumnok@gmail.com> * Remove spurious mlflow test dependency Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Add explicit check for database existence Signed-off-by: Jannic Holzer 
<jannic.holzer@quantumblack.com> * Remove character limit for table names Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Refactor validation steps in ManagedTable Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Remove spurious checks for table and schema name existence Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> --------- Signed-off-by: Danny Farah <danny_farah@mckinsey.com> Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> Co-authored-by: Danny Farah <danny.farah@quantumblack.com> Co-authored-by: Danny Farah <danny_farah@mckinsey.com> Co-authored-by: Nok Lam Chan <mediumnok@gmail.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- Makefile | 4 +- kedro-datasets/.gitignore | 3 + .../kedro_datasets/databricks/__init__.py | 8 + .../databricks/managed_table_dataset.py | 432 ++++++++++++++++ kedro-datasets/setup.py | 8 +- kedro-datasets/test_requirements.txt | 2 +- kedro-datasets/tests/databricks/__init__.py | 0 kedro-datasets/tests/databricks/conftest.py | 25 + .../databricks/test_managed_table_dataset.py | 484 ++++++++++++++++++ 9 files changed, 962 insertions(+), 4 deletions(-) create mode 100644 kedro-datasets/kedro_datasets/databricks/__init__.py create mode 100644 kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py create mode 100644 kedro-datasets/tests/databricks/__init__.py create mode 100644 kedro-datasets/tests/databricks/conftest.py create mode 100644 kedro-datasets/tests/databricks/test_managed_table_dataset.py diff --git a/Makefile b/Makefile index be653ed59..4e0b4e640 100644 --- a/Makefile +++ b/Makefile @@ -52,10 +52,10 @@ sign-off: # kedro-datasets related only test-no-spark: - cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --numprocesses 4 --dist loadfile + cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks --numprocesses 4 --dist loadfile test-no-spark-sequential: - cd kedro-datasets && pytest tests --no-cov --ignore tests/spark + cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks # kedro-datasets/snowflake tests skipped from default scope test-snowflake-only: diff --git a/kedro-datasets/.gitignore b/kedro-datasets/.gitignore index d20ee9733..721e13f70 100644 --- a/kedro-datasets/.gitignore +++ b/kedro-datasets/.gitignore @@ -145,3 +145,6 @@ kedro.db kedro/html docs/tmp-build-artifacts docs/build +spark-warehouse +metastore_db/ +derby.log diff --git a/kedro-datasets/kedro_datasets/databricks/__init__.py b/kedro-datasets/kedro_datasets/databricks/__init__.py new file mode 100644 index 000000000..d416ac291 --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/__init__.py @@ -0,0 +1,8 @@ +"""Provides interface to Unity Catalog Tables.""" + +__all__ = ["ManagedTableDataSet"] + +from contextlib import suppress + +with suppress(ImportError): + from .managed_table_dataset import ManagedTableDataSet diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py new file mode 100644 index 000000000..01ec15a6f --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -0,0 +1,432 @@ +"""``ManagedTableDataSet`` implementation to access managed delta tables +in Databricks. 
+""" +import logging +import re +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +import pandas as pd +from kedro.io.core import ( + AbstractVersionedDataSet, + DataSetError, + Version, + VersionNotFoundError, +) +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.types import StructType +from pyspark.sql.utils import AnalysisException, ParseException + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class ManagedTable: # pylint: disable=too-many-instance-attributes + """Stores the definition of a managed table""" + + # regex for tables, catalogs and schemas + _NAMING_REGEX = r"\b[0-9a-zA-Z_-]{1,}\b" + _VALID_WRITE_MODES = ["overwrite", "upsert", "append"] + _VALID_DATAFRAME_TYPES = ["spark", "pandas"] + database: str + catalog: Optional[str] + table: str + write_mode: str + dataframe_type: str + primary_key: Optional[str] + owner_group: str + partition_columns: Union[str, List[str]] + json_schema: StructType + + def __post_init__(self): + """Run validation methods if declared. + The validation method can be a simple check + that raises DataSetError. + The validation is performed by calling a function named: + `validate_<field_name>(self, value) -> raises DataSetError` + """ + for name in self.__dataclass_fields__.keys(): # pylint: disable=no-member + method = getattr(self, f"_validate_{name}", None) + if method: + method() + + def _validate_table(self): + """Validates table name + + Raises: + DataSetError: If the table name does not conform to naming constraints. + """ + if not re.fullmatch(self._NAMING_REGEX, self.table): + raise DataSetError("table does not conform to naming") + + def _validate_database(self): + """Validates database name + + Raises: + DataSetError: If the dataset name does not conform to naming constraints. + """ + if not re.fullmatch(self._NAMING_REGEX, self.database): + raise DataSetError("database does not conform to naming") + + def _validate_catalog(self): + """Validates catalog name + + Raises: + DataSetError: If the catalog name does not conform to naming constraints. + """ + if self.catalog: + if not re.fullmatch(self._NAMING_REGEX, self.catalog): + raise DataSetError("catalog does not conform to naming") + + def _validate_write_mode(self): + """Validates the write mode + + Raises: + DataSetError: If an invalid `write_mode` is passed. + """ + if self.write_mode not in self._VALID_WRITE_MODES: + valid_modes = ", ".join(self._VALID_WRITE_MODES) + raise DataSetError( + f"Invalid `write_mode` provided: {self.write_mode}. " + f"`write_mode` must be one of: {valid_modes}" + ) + + def _validate_dataframe_type(self): + """Validates the dataframe type + + Raises: + DataSetError: If an invalid `dataframe_type` is passed + """ + if self.dataframe_type not in self._VALID_DATAFRAME_TYPES: + valid_types = ", ".join(self._VALID_DATAFRAME_TYPES) + raise DataSetError(f"`dataframe_type` must be one of {valid_types}") + + def _validate_primary_key(self): + """Validates the primary key of the table + + Raises: + DataSetError: If no `primary_key` is specified. 
+ """ + if self.primary_key is None or len(self.primary_key) == 0: + if self.write_mode == "upsert": + raise DataSetError( + f"`primary_key` must be provided for" + f"`write_mode` {self.write_mode}" + ) + + def full_table_location(self) -> str: + """Returns the full table location + + Returns: + str: table location in the format catalog.database.table + """ + full_table_location = None + if self.catalog and self.database and self.table: + full_table_location = f"`{self.catalog}`.`{self.database}`.`{self.table}`" + elif self.database and self.table: + full_table_location = f"`{self.database}`.`{self.table}`" + return full_table_location + + def schema(self) -> StructType: + """Returns the Spark schema of the table if it exists + + Returns: + StructType: + """ + schema = None + try: + if self.json_schema is not None: + schema = StructType.fromJson(self.json_schema) + except (KeyError, ValueError) as exc: + raise DataSetError(exc) from exc + return schema + + +class ManagedTableDataSet(AbstractVersionedDataSet): + """``ManagedTableDataSet`` loads and saves data into managed delta tables on Databricks. + Load and save can be in Spark or Pandas dataframes, specified in dataframe_type. + When saving data, you can specify one of three modes: overwrite(default), append, + or upsert. Upsert requires you to specify the primary_column parameter which + will be used as part of the join condition. This dataset works best with + the databricks kedro starter. That starter comes with hooks that allow this + dataset to function properly. Follow the instructions in that starter to + setup your project for this dataset. + + Example usage for the + `YAML API <https://kedro.readthedocs.io/en/stable/data/\ + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: + + .. code-block:: yaml + + names_and_ages@spark: + type: databricks.ManagedTableDataSet + table: names_and_ages + + names_and_ages@pandas: + type: databricks.ManagedTableDataSet + table: names_and_ages + dataframe_type: pandas + + Example usage for the + `Python API <https://kedro.readthedocs.io/en/stable/data/\ + data_catalog.html#use-the-data-catalog-with-the-code-api>`_: + .. 
code-block:: python + + from pyspark.sql import SparkSession + from pyspark.sql.types import (StructField, StringType, + IntegerType, StructType) + from kedro_datasets.databricks import ManagedTableDataSet + schema = StructType([StructField("name", StringType(), True), + StructField("age", IntegerType(), True)]) + data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) + data_set = ManagedTableDataSet(table="names_and_ages") + data_set.save(spark_df) + reloaded = data_set.load() + reloaded.take(4) + """ + + # this dataset cannot be used with ``ParallelRunner``, + # therefore it has the attribute ``_SINGLE_PROCESS = True`` + # for parallelism within a Spark pipeline please consider + # using ``ThreadRunner`` instead + _SINGLE_PROCESS = True + + def __init__( # pylint: disable=R0913 + self, + table: str, + catalog: str = None, + database: str = "default", + write_mode: str = "overwrite", + dataframe_type: str = "spark", + primary_key: Optional[Union[str, List[str]]] = None, + version: Version = None, + *, + # the following parameters are used by project hooks + # to create or update table properties + schema: Dict[str, Any] = None, + partition_columns: List[str] = None, + owner_group: str = None, + ) -> None: + """Creates a new instance of ``ManagedTableDataSet`` + + Args: + table (str): the name of the table + catalog (str, optional): the name of the catalog in Unity. + Defaults to None. + database (str, optional): the name of the database. + (also referred to as schema). Defaults to "default". + write_mode (str, optional): the mode to write the data into the table. + Options are:["overwrite", "append", "upsert"]. + "upsert" mode requires primary_key field to be populated. + Defaults to "overwrite". + dataframe_type (str, optional): "pandas" or "spark" dataframe. + Defaults to "spark". + primary_key (Union[str, List[str]], optional): the primary key of the table. + Can be in the form of a list. Defaults to None. + version (Version, optional): kedro.io.core.Version instance to load the data. + Defaults to None. + schema (Dict[str, Any], optional): the schema of the table in JSON form. + Dataframes will be truncated to match the schema if provided. + Used by the hooks to create the table if the schema is provided + Defaults to None. + partition_columns (List[str], optional): the columns to use for partitioning the table. + Used by the hooks. Defaults to None. + owner_group (str, optional): if table access control is enabled in your workspace, + specifying owner_group will transfer ownership of the table and database to + this owner. All databases should have the same owner_group. Defaults to None. 
+ Raises: + DataSetError: Invalid configuration supplied (through ManagedTable validation) + """ + + self._table = ManagedTable( + database=database, + catalog=catalog, + table=table, + write_mode=write_mode, + dataframe_type=dataframe_type, + primary_key=primary_key, + owner_group=owner_group, + partition_columns=partition_columns, + json_schema=schema, + ) + + self._version = version + + super().__init__( + filepath=None, + version=version, + exists_function=self._exists, + ) + + @staticmethod + def _get_spark() -> SparkSession: + return SparkSession.builder.getOrCreate() + + def _load(self) -> Union[DataFrame, pd.DataFrame]: + """Loads the version of data in the format defined in the init + (spark|pandas dataframe) + + Raises: + VersionNotFoundError: if the version defined in + the init doesn't exist + + Returns: + Union[DataFrame, pd.DataFrame]: Returns a dataframe + in the format defined in the init + """ + if self._version and self._version.load >= 0: + try: + data = ( + self._get_spark() + .read.format("delta") + .option("versionAsOf", self._version.load) + .table(self._table.full_table_location()) + ) + except Exception as exc: + raise VersionNotFoundError(self._version.load) from exc + else: + data = self._get_spark().table(self._table.full_table_location()) + if self._table.dataframe_type == "pandas": + data = data.toPandas() + return data + + def _save_append(self, data: DataFrame) -> None: + """Saves the data to the table by appending it + to the location defined in the init + + Args: + data (DataFrame): the Spark dataframe to append to the table + """ + data.write.format("delta").mode("append").saveAsTable( + self._table.full_table_location() + ) + + def _save_overwrite(self, data: DataFrame) -> None: + """Overwrites the data in the table with the data provided. + (this is the default save mode) + + Args: + data (DataFrame): the Spark dataframe to overwrite the table with. + """ + delta_table = data.write.format("delta") + if self._table.write_mode == "overwrite": + delta_table = delta_table.mode("overwrite").option( + "overwriteSchema", "true" + ) + delta_table.saveAsTable(self._table.full_table_location()) + + def _save_upsert(self, update_data: DataFrame) -> None: + """Upserts the data by joining on primary_key columns or column. + If table doesn't exist at save, the data is inserted to a new table. + + Args: + update_data (DataFrame): the Spark dataframe to upsert + """ + if self._exists(): + base_data = self._get_spark().table(self._table.full_table_location()) + base_columns = base_data.columns + update_columns = update_data.columns + + if set(update_columns) != set(base_columns): + raise DataSetError( + f"Upsert requires tables to have identical columns. 
" + f"Delta table {self._table.full_table_location()} " + f"has columns: {base_columns}, whereas " + f"dataframe has columns {update_columns}" + ) + + where_expr = "" + if isinstance(self._table.primary_key, str): + where_expr = ( + f"base.{self._table.primary_key}=update.{self._table.primary_key}" + ) + elif isinstance(self._table.primary_key, list): + where_expr = " AND ".join( + f"base.{col}=update.{col}" for col in self._table.primary_key + ) + + update_data.createOrReplaceTempView("update") + self._get_spark().conf.set( + "fullTableAddress", self._table.full_table_location() + ) + self._get_spark().conf.set("whereExpr", where_expr) + upsert_sql = """MERGE INTO ${fullTableAddress} base USING update ON ${whereExpr} + WHEN MATCHED THEN UPDATE SET * WHEN NOT MATCHED THEN INSERT *""" + self._get_spark().sql(upsert_sql) + else: + self._save_append(update_data) + + def _save(self, data: Union[DataFrame, pd.DataFrame]) -> None: + """Saves the data based on the write_mode and dataframe_type in the init. + If write_mode is pandas, Spark dataframe is created first. + If schema is provided, data is matched to schema before saving + (columns will be sorted and truncated). + + Args: + data (Any): Spark or pandas dataframe to save to the table location + """ + # filter columns specified in schema and match their ordering + if self._table.schema(): + cols = self._table.schema().fieldNames() + if self._table.dataframe_type == "pandas": + data = self._get_spark().createDataFrame( + data.loc[:, cols], schema=self._table.schema() + ) + else: + data = data.select(*cols) + else: + if self._table.dataframe_type == "pandas": + data = self._get_spark().createDataFrame(data) + if self._table.write_mode == "overwrite": + self._save_overwrite(data) + elif self._table.write_mode == "upsert": + self._save_upsert(data) + elif self._table.write_mode == "append": + self._save_append(data) + + def _describe(self) -> Dict[str, str]: + """Returns a description of the instance of ManagedTableDataSet + + Returns: + Dict[str, str]: Dict with the details of the dataset + """ + return { + "catalog": self._table.catalog, + "database": self._table.database, + "table": self._table.table, + "write_mode": self._table.write_mode, + "dataframe_type": self._table.dataframe_type, + "primary_key": self._table.primary_key, + "version": str(self._version), + "owner_group": self._table.owner_group, + "partition_columns": self._table.partition_columns, + } + + def _exists(self) -> bool: + """Checks to see if the table exists + + Returns: + bool: boolean of whether the table defined + in the dataset instance exists in the Spark session + """ + if self._table.catalog: + try: + self._get_spark().sql(f"USE CATALOG `{self._table.catalog}`") + except (ParseException, AnalysisException) as exc: + logger.warning( + "catalog %s not found or unity not enabled. 
Error message: %s", + self._table.catalog, + exc, + ) + try: + return ( + self._get_spark() + .sql(f"SHOW TABLES IN `{self._table.database}`") + .filter(f"tableName = '{self._table.table}'") + .count() + > 0 + ) + except (ParseException, AnalysisException) as exc: + logger.warning("error occured while trying to find table: %s", exc) + return False diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index be99f9912..a154d8132 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -8,6 +8,7 @@ HDFS = "hdfs>=2.5.8, <3.0" S3FS = "s3fs>=0.3.0, <0.5" POLARS = "polars~=0.17.0" +DELTA = "delta-spark~=1.2.1" def _collect_requirements(requires): @@ -16,7 +17,10 @@ def _collect_requirements(requires): api_require = {"api.APIDataSet": ["requests~=2.20"]} biosequence_require = {"biosequence.BioSequenceDataSet": ["biopython~=1.73"]} -dask_require = {"dask.ParquetDataSet": ["dask[complete]", "triad>=0.6.7, <1.0"]} +dask_require = {"dask.ParquetDataSet": ["dask[complete]~=2021.10", "triad>=0.6.7, <1.0"]} +databricks_require = { + "databricks.ManagedTableDataSet": [SPARK, PANDAS, DELTA] +} geopandas_require = { "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"] } @@ -79,6 +83,7 @@ def _collect_requirements(requires): "api": _collect_requirements(api_require), "biosequence": _collect_requirements(biosequence_require), "dask": _collect_requirements(dask_require), + "databricks": _collect_requirements(databricks_require), "docs": [ "docutils==0.16", "sphinx~=3.4.3", @@ -108,6 +113,7 @@ def _collect_requirements(requires): **api_require, **biosequence_require, **dask_require, + **databricks_require, **geopandas_require, **holoviews_require, **matplotlib_require, diff --git a/kedro-datasets/test_requirements.txt b/kedro-datasets/test_requirements.txt index 4d4954739..fe20fee5f 100644 --- a/kedro-datasets/test_requirements.txt +++ b/kedro-datasets/test_requirements.txt @@ -30,7 +30,7 @@ networkx~=2.4 opencv-python~=4.5.5.64 openpyxl>=3.0.3, <4.0 pandas-gbq>=0.12.0, <0.18.0 -pandas>=1.3 # 1.3 for read_xml/to_xml +pandas>=1.3, <2 # 1.3 for read_xml/to_xml, <2 for compatibility with Spark < 3.4 Pillow~=9.0 plotly>=4.8.0, <6.0 polars~=0.15.13 diff --git a/kedro-datasets/tests/databricks/__init__.py b/kedro-datasets/tests/databricks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-datasets/tests/databricks/conftest.py b/kedro-datasets/tests/databricks/conftest.py new file mode 100644 index 000000000..26d63b056 --- /dev/null +++ b/kedro-datasets/tests/databricks/conftest.py @@ -0,0 +1,25 @@ +""" +This file contains the fixtures that are reusable by any tests within +this directory. You don't need to import the fixtures as pytest will +discover them automatically. 
More info here: +https://docs.pytest.org/en/latest/fixture.html +""" +import pytest +from pyspark.sql import SparkSession + + +@pytest.fixture(scope="class", autouse=True) +def spark_session(): + spark = ( + SparkSession.builder.appName("test") + .config("spark.jars.packages", "io.delta:delta-core_2.12:1.2.1") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog", + ) + .getOrCreate() + ) + spark.sql("create database if not exists test") + yield spark + spark.sql("drop database test cascade;") diff --git a/kedro-datasets/tests/databricks/test_managed_table_dataset.py b/kedro-datasets/tests/databricks/test_managed_table_dataset.py new file mode 100644 index 000000000..9aae08707 --- /dev/null +++ b/kedro-datasets/tests/databricks/test_managed_table_dataset.py @@ -0,0 +1,484 @@ +import pandas as pd +import pytest +from kedro.io.core import DataSetError, Version, VersionNotFoundError +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.types import IntegerType, StringType, StructField, StructType + +from kedro_datasets.databricks import ManagedTableDataSet + + +@pytest.fixture +def sample_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def upsert_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Alex", 32), ("Evan", 23)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def mismatched_upsert_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("height", IntegerType(), True), + ] + ) + + data = [("Alex", 32, 174), ("Evan", 23, 166)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def subset_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("height", IntegerType(), True), + ] + ) + + data = [("Alex", 32, 174), ("Evan", 23, 166)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def subset_pandas_df(): + return pd.DataFrame( + {"name": ["Alex", "Evan"], "age": [32, 23], "height": [174, 166]} + ) + + +@pytest.fixture +def subset_expected_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Alex", 32), ("Evan", 23)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def sample_pandas_df(): + return pd.DataFrame( + {"name": ["Alex", "Bob", "Clarke", "Dave"], "age": [31, 12, 65, 29]} + ) + + +@pytest.fixture +def append_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Evan", 23), ("Frank", 13)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def expected_append_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", 
IntegerType(), True), + ] + ) + + data = [ + ("Alex", 31), + ("Bob", 12), + ("Clarke", 65), + ("Dave", 29), + ("Evan", 23), + ("Frank", 13), + ] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def expected_upsert_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [ + ("Alex", 32), + ("Bob", 12), + ("Clarke", 65), + ("Dave", 29), + ("Evan", 23), + ] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def expected_upsert_multiple_primary_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [ + ("Alex", 31), + ("Alex", 32), + ("Bob", 12), + ("Clarke", 65), + ("Dave", 29), + ("Evan", 23), + ] + + return spark_session.createDataFrame(data, schema) + + +# pylint: disable=too-many-public-methods +class TestManagedTableDataSet: + def test_full_table(self): + unity_ds = ManagedTableDataSet(catalog="test", database="test", table="test") + assert unity_ds._table.full_table_location() == "`test`.`test`.`test`" + + unity_ds = ManagedTableDataSet( + catalog="test-test", database="test", table="test" + ) + assert unity_ds._table.full_table_location() == "`test-test`.`test`.`test`" + + unity_ds = ManagedTableDataSet(database="test", table="test") + assert unity_ds._table.full_table_location() == "`test`.`test`" + + unity_ds = ManagedTableDataSet(table="test") + assert unity_ds._table.full_table_location() == "`default`.`test`" + + with pytest.raises(TypeError): + ManagedTableDataSet() # pylint: disable=no-value-for-parameter + + def test_describe(self): + unity_ds = ManagedTableDataSet(table="test") + assert unity_ds._describe() == { + "catalog": None, + "database": "default", + "table": "test", + "write_mode": "overwrite", + "dataframe_type": "spark", + "primary_key": None, + "version": "None", + "owner_group": None, + "partition_columns": None, + } + + def test_invalid_write_mode(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", write_mode="invalid") + + def test_dataframe_type(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", dataframe_type="invalid") + + def test_missing_primary_key_upsert(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", write_mode="upsert") + + def test_invalid_table_name(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="invalid!") + + def test_invalid_database(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", database="invalid!") + + def test_invalid_catalog(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", catalog="invalid!") + + def test_schema(self): + unity_ds = ManagedTableDataSet( + table="test", + schema={ + "fields": [ + { + "metadata": {}, + "name": "name", + "nullable": True, + "type": "string", + }, + { + "metadata": {}, + "name": "age", + "nullable": True, + "type": "integer", + }, + ], + "type": "struct", + }, + ) + expected_schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + assert unity_ds._table.schema() == expected_schema + + def test_invalid_schema(self): + with pytest.raises(DataSetError): + ManagedTableDataSet( + table="test", + schema={ + "fields": [ + { + "invalid": "schema", + } + ], + "type": "struct", + }, + )._table.schema() + + def 
test_catalog_exists(self): + unity_ds = ManagedTableDataSet( + catalog="test", database="invalid", table="test_not_there" + ) + assert not unity_ds._exists() + + def test_table_does_not_exist(self): + unity_ds = ManagedTableDataSet(database="invalid", table="test_not_there") + assert not unity_ds._exists() + + def test_save_default(self, sample_spark_df: DataFrame): + unity_ds = ManagedTableDataSet(database="test", table="test_save") + unity_ds.save(sample_spark_df) + saved_table = unity_ds.load() + assert ( + unity_ds._exists() and sample_spark_df.exceptAll(saved_table).count() == 0 + ) + + def test_save_schema_spark( + self, subset_spark_df: DataFrame, subset_expected_df: DataFrame + ): + unity_ds = ManagedTableDataSet( + database="test", + table="test_save_spark_schema", + schema={ + "fields": [ + { + "metadata": {}, + "name": "name", + "nullable": True, + "type": "string", + }, + { + "metadata": {}, + "name": "age", + "nullable": True, + "type": "integer", + }, + ], + "type": "struct", + }, + ) + unity_ds.save(subset_spark_df) + saved_table = unity_ds.load() + assert subset_expected_df.exceptAll(saved_table).count() == 0 + + def test_save_schema_pandas( + self, subset_pandas_df: pd.DataFrame, subset_expected_df: DataFrame + ): + unity_ds = ManagedTableDataSet( + database="test", + table="test_save_pd_schema", + schema={ + "fields": [ + { + "metadata": {}, + "name": "name", + "nullable": True, + "type": "string", + }, + { + "metadata": {}, + "name": "age", + "nullable": True, + "type": "integer", + }, + ], + "type": "struct", + }, + dataframe_type="pandas", + ) + unity_ds.save(subset_pandas_df) + saved_ds = ManagedTableDataSet( + database="test", + table="test_save_pd_schema", + ) + saved_table = saved_ds.load() + assert subset_expected_df.exceptAll(saved_table).count() == 0 + + def test_save_overwrite( + self, sample_spark_df: DataFrame, append_spark_df: DataFrame + ): + unity_ds = ManagedTableDataSet(database="test", table="test_save") + unity_ds.save(sample_spark_df) + unity_ds.save(append_spark_df) + + overwritten_table = unity_ds.load() + + assert append_spark_df.exceptAll(overwritten_table).count() == 0 + + def test_save_append( + self, + sample_spark_df: DataFrame, + append_spark_df: DataFrame, + expected_append_spark_df: DataFrame, + ): + unity_ds = ManagedTableDataSet( + database="test", table="test_save_append", write_mode="append" + ) + unity_ds.save(sample_spark_df) + unity_ds.save(append_spark_df) + + appended_table = unity_ds.load() + + assert expected_append_spark_df.exceptAll(appended_table).count() == 0 + + def test_save_upsert( + self, + sample_spark_df: DataFrame, + upsert_spark_df: DataFrame, + expected_upsert_spark_df: DataFrame, + ): + unity_ds = ManagedTableDataSet( + database="test", + table="test_save_upsert", + write_mode="upsert", + primary_key="name", + ) + unity_ds.save(sample_spark_df) + unity_ds.save(upsert_spark_df) + + upserted_table = unity_ds.load() + + assert expected_upsert_spark_df.exceptAll(upserted_table).count() == 0 + + def test_save_upsert_multiple_primary( + self, + sample_spark_df: DataFrame, + upsert_spark_df: DataFrame, + expected_upsert_multiple_primary_spark_df: DataFrame, + ): + unity_ds = ManagedTableDataSet( + database="test", + table="test_save_upsert_multiple", + write_mode="upsert", + primary_key=["name", "age"], + ) + unity_ds.save(sample_spark_df) + unity_ds.save(upsert_spark_df) + + upserted_table = unity_ds.load() + + assert ( + expected_upsert_multiple_primary_spark_df.exceptAll(upserted_table).count() + == 0 + ) + + def 
test_save_upsert_mismatched_columns( + self, + sample_spark_df: DataFrame, + mismatched_upsert_spark_df: DataFrame, + ): + unity_ds = ManagedTableDataSet( + database="test", + table="test_save_upsert_mismatch", + write_mode="upsert", + primary_key="name", + ) + unity_ds.save(sample_spark_df) + with pytest.raises(DataSetError): + unity_ds.save(mismatched_upsert_spark_df) + + def test_load_spark(self, sample_spark_df: DataFrame): + unity_ds = ManagedTableDataSet(database="test", table="test_load_spark") + unity_ds.save(sample_spark_df) + + delta_ds = ManagedTableDataSet(database="test", table="test_load_spark") + delta_table = delta_ds.load() + + assert ( + isinstance(delta_table, DataFrame) + and delta_table.exceptAll(sample_spark_df).count() == 0 + ) + + def test_load_spark_no_version(self, sample_spark_df: DataFrame): + unity_ds = ManagedTableDataSet(database="test", table="test_load_spark") + unity_ds.save(sample_spark_df) + + delta_ds = ManagedTableDataSet( + database="test", table="test_load_spark", version=Version(2, None) + ) + with pytest.raises(VersionNotFoundError): + _ = delta_ds.load() + + def test_load_version(self, sample_spark_df: DataFrame, append_spark_df: DataFrame): + unity_ds = ManagedTableDataSet( + database="test", table="test_load_version", write_mode="append" + ) + unity_ds.save(sample_spark_df) + unity_ds.save(append_spark_df) + + loaded_ds = ManagedTableDataSet( + database="test", table="test_load_version", version=Version(0, None) + ) + loaded_df = loaded_ds.load() + + assert loaded_df.exceptAll(sample_spark_df).count() == 0 + + def test_load_pandas(self, sample_pandas_df: pd.DataFrame): + unity_ds = ManagedTableDataSet( + database="test", table="test_load_pandas", dataframe_type="pandas" + ) + unity_ds.save(sample_pandas_df) + + pandas_ds = ManagedTableDataSet( + database="test", table="test_load_pandas", dataframe_type="pandas" + ) + pandas_df = pandas_ds.load().sort_values("name", ignore_index=True) + + assert isinstance(pandas_df, pd.DataFrame) and pandas_df.equals( + sample_pandas_df + ) From 0aaa922b39d2ff7177c05a0e8ccbaba346d6a579 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan <mediumnok@gmail.com> Date: Mon, 22 May 2023 17:49:58 +0100 Subject: [PATCH 90/96] docs: Update APIDataset docs and refactor (#217) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update APIDataset docs and refactor * Acknowledge community contributor * Fix more broken doc Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> * Lint Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> * Fix release notes of upcoming kedro-datasets --------- Signed-off-by: Nok Chan <nok.lam.chan@quantumblack.com> Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> Co-authored-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/RELEASE.md | 9 ++------- kedro-datasets/kedro_datasets/api/api_dataset.py | 12 ++++++------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 76d730159..3ff1dbf6f 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -10,18 +10,13 @@ ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. 
+* Renamed `TensorFlowModelDataset` to `TensorFlowModelDataSet` to be consistent with all other plugins in kedro-datasets. ## Community contributions Many thanks to the following Kedroids for contributing PRs to this release: * [BrianCechmanek](https://github.com/BrianCechmanek) - -# Release 1.2.1: - -## Major features and improvements: - -## Bug fixes and other changes -* Renamed `TensorFlowModelDataset` to `TensorFlowModelDataSet` to be consistent with all other plugins in kedro-datasets. +* [McDonnellJoseph](https://github.com/McDonnellJoseph) # Release 1.2.0: diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index ad2a6c367..82bba3546 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -59,7 +59,7 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): >>> example_table = '{"col1":["val1", "val2"], "col2":["val3", "val4"]}' >>> data_set = APIDataSet( - method = "POST" + method = "POST", url = "url_of_remote_server", save_args = {"chunk_size":1} ) @@ -109,14 +109,14 @@ def __init__( during load method. Adds an optional parameter, ``chunk_size`` which determines the size of the package sent at each request. credentials: Allows specifying secrets in credentials.yml. - Expected format is ``('login', 'password')`` if given as a tuple or list. - An ``AuthBase`` instance can be provided for more complex cases. + Expected format is ``('login', 'password')`` if given as a tuple or + list. An ``AuthBase`` instance can be provided for more complex cases. metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. Raises: - ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are - specified. + ValueError: if both ``auth`` and ``credentials`` are specified or used + unsupported RESTful API method. 
""" super().__init__() @@ -124,7 +124,7 @@ def __init__( if method == "GET": self._params = load_args or {} - # PUT, POST, DELETE means save + # PUT, POST means save elif method in ["PUT", "POST"]: self._params = deepcopy(self.DEFAULT_SAVE_ARGS) if save_args is not None: From ccec03bb9fbae1ab5d91353b8ac5de5d87de1013 Mon Sep 17 00:00:00 2001 From: Jannic <37243923+jmholzer@users.noreply.github.com> Date: Mon, 22 May 2023 21:23:08 +0100 Subject: [PATCH 91/96] feat: Release `kedro-datasets` version `1.3.0` (#219) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Modify release version and RELEASE.md Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Add proper name for ManagedTableDataSet Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> * Update kedro-datasets/RELEASE.md Co-authored-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> * Revert lost semicolon for release 1.2.0 Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> --------- Signed-off-by: Jannic Holzer <jannic.holzer@quantumblack.com> Co-authored-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/RELEASE.md | 17 ++++++++++++++--- kedro-datasets/kedro_datasets/__init__.py | 2 +- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 3ff1dbf6f..ed347ca60 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,15 +1,25 @@ # Upcoming Release: -## Major features and improvements: +## Major features and improvements + +## Bug fixes and other changes + +## Community contributions + +# Release 1.3.0: + +## Major features and improvements * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). * Added a save method to the APIDataSet * Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. * Relaxed Kedro version pin to `>=0.16` * Added `metadata` attribute to all existing datasets. This is ignored by Kedro, but may be consumed by users or external plugins. +* Added `ManagedTableDataSet` for managed delta tables on Databricks. ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. +* Upgraded required `polars` version to 0.17. * Renamed `TensorFlowModelDataset` to `TensorFlowModelDataSet` to be consistent with all other plugins in kedro-datasets. ## Community contributions @@ -17,10 +27,11 @@ Many thanks to the following Kedroids for contributing PRs to this release: * [BrianCechmanek](https://github.com/BrianCechmanek) * [McDonnellJoseph](https://github.com/McDonnellJoseph) +* [Danny Farah](https://github.com/dannyrfar) # Release 1.2.0: -## Major features and improvements: +## Major features and improvements * Added `fsspec` resolution in `SparkDataSet` to support more filesystems. * Added the `_preview` method to the Pandas `ExcelDataSet` and `CSVDataSet` classes. 
@@ -35,7 +46,7 @@ Many thanks to the following Kedroids for contributing PRs to this release: # Release 1.1.0: -## Major features and improvements: +## Major features and improvements * Added the following new datasets: diff --git a/kedro-datasets/kedro_datasets/__init__.py b/kedro-datasets/kedro_datasets/__init__.py index 5c3838ceb..96aa32f85 100644 --- a/kedro-datasets/kedro_datasets/__init__.py +++ b/kedro-datasets/kedro_datasets/__init__.py @@ -1,3 +1,3 @@ """``kedro_datasets`` is where you can find all of Kedro's data connectors.""" -__version__ = "1.2.0" +__version__ = "1.3.0" From c2a712853a10931b22e1cc7dfe0410a45135289a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= <juan_luis_cano@mckinsey.com> Date: Tue, 23 May 2023 23:38:56 +0200 Subject: [PATCH 92/96] docs: Fix APIDataSet docstring (#220) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix APIDataSet docstring Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> * Add release notes Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> * Separate [docs] extras from [all] in kedro-datasets Fix gh-143. Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> --------- Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/RELEASE.md | 1 + .../kedro_datasets/api/api_dataset.py | 4 ++-- kedro-datasets/setup.py | 22 +++++++++---------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index ed347ca60..8406a063c 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -3,6 +3,7 @@ ## Major features and improvements ## Bug fixes and other changes +* Fixed problematic docstrings causing Read the Docs builds on Kedro to fail. ## Community contributions diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index 82bba3546..0929f56fe 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -54,7 +54,7 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): >>> data = data_set.load() ``APIDataSet`` can also be used to save output on a remote server using HTTP(S) - methods. + methods. :: >>> example_table = '{"col1":["val1", "val2"], "col2":["val3", "val4"]}' @@ -116,7 +116,7 @@ def __init__( Raises: ValueError: if both ``auth`` and ``credentials`` are specified or used - unsupported RESTful API method. + unsupported RESTful API method. 
""" super().__init__() diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index a154d8132..bc4ce794d 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -84,17 +84,6 @@ def _collect_requirements(requires): "biosequence": _collect_requirements(biosequence_require), "dask": _collect_requirements(dask_require), "databricks": _collect_requirements(databricks_require), - "docs": [ - "docutils==0.16", - "sphinx~=3.4.3", - "sphinx_rtd_theme==0.4.1", - "nbsphinx==0.8.1", - "nbstripout~=0.4", - "sphinx-autodoc-typehints==1.11.1", - "sphinx_copybutton==0.3.1", - "ipykernel>=5.3, <7.0", - "myst-parser~=0.17.2", - ], "geopandas": _collect_requirements(geopandas_require), "holoviews": _collect_requirements(holoviews_require), "matplotlib": _collect_requirements(matplotlib_require), @@ -131,6 +120,17 @@ def _collect_requirements(requires): } extras_require["all"] = _collect_requirements(extras_require) +extras_require["docs"] = [ + "docutils==0.16", + "sphinx~=3.4.3", + "sphinx_rtd_theme==0.4.1", + "nbsphinx==0.8.1", + "nbstripout~=0.4", + "sphinx-autodoc-typehints==1.11.1", + "sphinx_copybutton==0.3.1", + "ipykernel>=5.3, <7.0", + "myst-parser~=0.17.2", +] setup( extras_require=extras_require, From 64446dc56772d117d888a1d650727cc191cbef39 Mon Sep 17 00:00:00 2001 From: kuriantom369 <116743025+kuriantom369@users.noreply.github.com> Date: Tue, 30 May 2023 09:40:55 +0100 Subject: [PATCH 93/96] Update kedro-datasets/tests/spark/test_spark_streaming_dataset.py Co-authored-by: Deepyaman Datta <deepyaman.datta@utexas.edu> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/tests/spark/test_spark_streaming_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py index b4e1f0414..c4fb6c005 100644 --- a/kedro-datasets/tests/spark/test_spark_streaming_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_streaming_dataset.py @@ -41,7 +41,7 @@ def sample_spark_df_schema() -> StructType: @pytest.fixture def sample_spark_streaming_df(tmp_path, sample_spark_df_schema): - """Create s sample dataframe for streaming""" + """Create a sample dataframe for streaming""" data = [("0001", 2), ("0001", 7), ("0002", 4)] schema_path = (tmp_path / SCHEMA_FILE_NAME).as_posix() with open(schema_path, "w", encoding="utf-8") as f: From 497001d5f3dcd1885814af0b143ded2aa9bef2c2 Mon Sep 17 00:00:00 2001 From: kuriantom369 <116743025+kuriantom369@users.noreply.github.com> Date: Tue, 30 May 2023 09:41:52 +0100 Subject: [PATCH 94/96] Update kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py Co-authored-by: Deepyaman Datta <deepyaman.datta@utexas.edu> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index 0f7e841ed..b34f277f9 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -58,7 +58,7 @@ def __init__( a list of read options for each supported format in Spark DataFrame read documentation: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html, - Please note that a schema is mandatory for a streaming DataFrame if schemaInference + Please note that a schema is 
mandatory for a streaming DataFrame if ``schemaInference`` is not True. save_args: Save args passed to Spark DataFrame write options. Similar to load_args this is dependent on the selected file From 7f25f3c5b101c07165bac3ff42359291421cbfa4 Mon Sep 17 00:00:00 2001 From: kuriantom369 <116743025+kuriantom369@users.noreply.github.com> Date: Tue, 30 May 2023 09:42:27 +0100 Subject: [PATCH 95/96] Update kedro-datasets/setup.py Co-authored-by: Deepyaman Datta <deepyaman.datta@utexas.edu> Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- kedro-datasets/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index bc4ce794d..210eb6884 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -51,7 +51,7 @@ def _collect_requirements(requires): "plotly.JSONDataSet": ["plotly>=4.8.0, <6.0"], } polars_require = { - "polars.CSVDataSet": [POLARS], + "polars.CSVDataSet": [POLARS] } redis_require = {"redis.PickleDataSet": ["redis~=4.1"]} snowflake_require = { From c094db1246a4efe8d8be4c43e0c79a3e84b12ac1 Mon Sep 17 00:00:00 2001 From: Tom Kurian <tom_kurian@mckinsey.com> Date: Tue, 30 May 2023 17:34:51 +0100 Subject: [PATCH 96/96] fix linting issue Signed-off-by: Tom Kurian <tom_kurian@mckinsey.com> --- .../kedro_datasets/spark/spark_streaming_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py index b34f277f9..2f7743e65 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py @@ -58,8 +58,8 @@ def __init__( a list of read options for each supported format in Spark DataFrame read documentation: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html, - Please note that a schema is mandatory for a streaming DataFrame if ``schemaInference`` - is not True. + Please note that a schema is mandatory for a streaming DataFrame + if ``schemaInference`` is not True. save_args: Save args passed to Spark DataFrame write options. Similar to load_args this is dependent on the selected file format. You can pass ``mode`` and ``partitionBy`` to specify